### Data Cleaning

In [14]:
# Load the DataFrame
import pandas as pd 
colleges_df = pd.read_csv('/Users/christianmoya/Documents/Flatiron/Phase_3/Phase_3_Project/colleges.csv')
colleges_df.drop(columns=['Unnamed: 0'], inplace=True, axis=1)
colleges_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1558 entries, 0 to 1557
Data columns (total 37 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   school_name                           1558 non-null   object 
 1   early_career_pay                      1558 non-null   int64  
 2   mid_career_pay                        1558 non-null   int64  
 3   meaning_percentage                    1558 non-null   int64  
 4   stem_percentage                       1558 non-null   int64  
 5   engineering                           1558 non-null   bool   
 6   private_school                        1558 non-null   bool   
 7   religious                             1558 non-null   bool   
 8   art                                   1558 non-null   bool   
 9   for_sports_fans                       1558 non-null   bool   
 10  party_school                          1558 non-null   bool   
 11  liberal_arts_scho

### Dealing with Null Values 

We find null values often where the DataFrames did not match, about 500 missing values for enrollment data, and about 300 missing values for tuition data. Room and board has about 400 missing values, but that data is also integrated into total tuition costs, so we'll get rid of that column. Since enrollment doesn't follow a normal distribution, I'm going to fill the null values with the median for each. 

In [15]:
# Drop unnecessary columns 
colleges_df.drop(columns=['school_name', 'mid_career_pay', 'room_and_board'], inplace=True)

In [16]:
# Fill in missing enrollment data 
enrollment = ['total_enrollment', 'AIAN_enrollment_percentage', 'Asian_enrollment_percentage', 'Black_enrollment_percentage','Hispanic_enrollment_percentage', 'NHPI_enrollment_percentage', 'non_resident_enrollment_percentage', 'total_minority_enrollment_percentage', 'unknown_enrollment_percentage', 'White_enrollment_percentage', 'women_enrollment_percentage']
for enroll in enrollment:
    colleges_df[enroll].fillna(colleges_df[enroll].median(), inplace=True)
    
colleges_df = colleges_df.dropna()

# Finalize model, engineer target value
colleges_df['over_60000'] = colleges_df['early_career_pay'] >= 60000
colleges_df['over_60000'] = colleges_df['over_60000'].astype(str)
colleges_df.drop(columns=['early_career_pay'], inplace=True)
colleges_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1201 entries, 0 to 1557
Data columns (total 34 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   meaning_percentage                    1201 non-null   int64  
 1   stem_percentage                       1201 non-null   int64  
 2   engineering                           1201 non-null   bool   
 3   private_school                        1201 non-null   bool   
 4   religious                             1201 non-null   bool   
 5   art                                   1201 non-null   bool   
 6   for_sports_fans                       1201 non-null   bool   
 7   party_school                          1201 non-null   bool   
 8   liberal_arts_school                   1201 non-null   bool   
 9   state_school                          1201 non-null   bool   
 10  research_university                   1201 non-null   bool   
 11  business         

### Baseline Logistic Regression Model with Scikit Learn 

In [19]:
# import necessary libraries
import statsmodels as sm 
import sklearn.preprocessing as preprocessing 
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split 
from scipy import stats

X = colleges_df.drop(columns=['over_60000'], axis=1)
X = pd.get_dummies(X, drop_first=True, dtype=float)

y = pd.get_dummies(colleges_df['over_60000'], drop_first=True, dtype=float)
y = y['True']

In [32]:
bool_columns = ['engineering', 'private_school', 'religious', 'art', 'for_sports_fans', 'party_school', 'liberal_arts_school', 'state_school', 'research_university', 'business', 'sober_school', 'ivy_league']
X[bool_columns] = X[bool_columns].astype(float)

In [33]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1201 entries, 0 to 1557
Data columns (total 90 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   const                                 1201 non-null   float64
 1   meaning_percentage                    1201 non-null   int64  
 2   stem_percentage                       1201 non-null   int64  
 3   engineering                           1201 non-null   float64
 4   private_school                        1201 non-null   float64
 5   religious                             1201 non-null   float64
 6   art                                   1201 non-null   float64
 7   for_sports_fans                       1201 non-null   float64
 8   party_school                          1201 non-null   float64
 9   liberal_arts_school                   1201 non-null   float64
 10  state_school                          1201 non-null   float64
 11  research_universi

In [34]:
X = sm.tools.add_constant(X)
logit_model = sm.discrete.discrete_model.Logit(y, X)
result = logit_model.fit()

         Current function value: inf
         Iterations: 35


  return 1/(1+np.exp(-X))
  return np.sum(np.log(self.cdf(q*np.dot(X,params))))


LinAlgError: Singular matrix

In [30]:
npl

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1201 entries, 0 to 1557
Data columns (total 90 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   const                                 1201 non-null   float64
 1   meaning_percentage                    1201 non-null   int64  
 2   stem_percentage                       1201 non-null   int64  
 3   engineering                           1201 non-null   bool   
 4   private_school                        1201 non-null   bool   
 5   religious                             1201 non-null   bool   
 6   art                                   1201 non-null   bool   
 7   for_sports_fans                       1201 non-null   bool   
 8   party_school                          1201 non-null   bool   
 9   liberal_arts_school                   1201 non-null   bool   
 10  state_school                          1201 non-null   bool   
 11  research_universi

In [28]:
np.asarray(y)

array([1., 1., 1., ..., 0., 0., 0.])

### Building a Model with Statsmodel

In [4]:
# import library
import statsmodels.api as sm
import numpy as np

X = colleges_df['over_60000']
X_ = sm.add_constant(X)
y = colleges_df.drop(columns=[''])

# Change booleans into categorical values
#school_types = ['engineering', 'private_school', 'religious', 'art', 'for_sports_fans', 'party_school', 'liberal_arts_school', 'state_school' , 'research_university', 'business', 'sober_school', 'ivy_league']
#colleges_df[school_types] = colleges_df[school_types].astype(int)

# Prepare data for model  
#continuous = ['meaning_percentage', 'stem_percentage', 'in_state_tuition', 'in_state_total', 'out_of_state_tuition', 'out_of_state_total', 'total_enrollment', 'AIAN_enrollment_percentage', 'Asian_enrollment_percentage', 'Black_enrollment_percentage', 'Hispanic_enrollment_percentage', 'NHPI_enrollment_percentage', 'non_resident_enrollment_percentage', 'unknown_enrollment_percentage', 'White_enrollment_percentage', 'women_enrollment_percentage']
#categoricals = ['engineering', 'private_school', 'religious', 'art', 'for_sports_fans', 'party_school', 'liberal_arts_school', 'state_school', 'research_university', 'business', 'sober_school', 'ivy_league', 'state_code', 'type', 'degree_length', 'school_rank']



# normalize (subract mean and divide by std)


colleges_log_norm = colleges_log.apply(normalize)

NameError: name 'colleges_log' is not defined

In [6]:
colleges_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1201 entries, 0 to 1557
Data columns (total 34 columns):
 #   Column                                Non-Null Count  Dtype  
---  ------                                --------------  -----  
 0   meaning_percentage                    1201 non-null   int64  
 1   stem_percentage                       1201 non-null   int64  
 2   engineering                           1201 non-null   int64  
 3   private_school                        1201 non-null   int64  
 4   religious                             1201 non-null   int64  
 5   art                                   1201 non-null   int64  
 6   for_sports_fans                       1201 non-null   int64  
 7   party_school                          1201 non-null   int64  
 8   liberal_arts_school                   1201 non-null   int64  
 9   state_school                          1201 non-null   int64  
 10  research_university                   1201 non-null   int64  
 11  business         

In [None]:
# Categorical Variable 
colleges_ohe = pd.get_dummies(colleges_df[categoricals], prefix=categoricals, drop_first=True)

In [None]:
colleges_ohe

In [None]:
colleges_df['over_60000'] = colleges_df['over_60000'].astype(int)

In [None]:
preprocessed_colleges = pd.concat([colleges_log_norm, colleges_ohe])
preprocessed_colleges.info()

In [None]:
X_train = colleges_ohe 
y_train = colleges_df['over_60000']

log_reg = sm.Logit(y_train, X_train).fit()