In [2]:
# required libraries
import pandas as pd
import numpy as np
from sklearn.model_selection import ShuffleSplit
# from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.feature_selection import RFE
from sklearn.svm import SVR
from sklearn.metrics import accuracy_score


In [3]:
df_master = pd.read_csv('PublicSchools2014to2017_YZ.csv')

df_headers = ['TotalTargets_pTarget_PctMet',
    # 'EVAAS Growth Status_NotMet',
    'MinorityMalePct',
    'MinorityFemalePct',
    # 'title1_type_cd_Y',
    'avg_daily_attend_pct',
    'short_susp_per_c_num',
    'BlackMalePct',
    'AsianFemalePct',
    'student_num',
    'HispanicMalePct',
    'SciGr5&8_pTarget_PctMet',
    'tchyrs_0thru3_pct',
    'tchyrs_11plus_pct',
    'Accomplished_TCHR_Standard 2_Pct',
    'Accomplished_TCHR_Standard 1_Pct',
    'Developing_TCHR_Standard 1_Pct',
    'Developing_TCHR_Standard 2_Pct',
    'Developing_TCHR_Standard 3_Pct',
    'Accomplished_TCHR_Standard 4_Pct',
    '4-10 Years_LEA_Exp_Pct_Prin',
    'Developing_TCHR_Standard 4_Pct',
    'Developing_TCHR_Standard 5_Pct',
    '10+ Years_LEA_Exp_Pct_Prin',
    'Accomplished_TCHR_Standard 3_Pct',
    'Accomplished_TCHR_Standard 5_Pct',
    'lea_state_perpupil_num',
    'st_emer_prov_teach_pct',
    'pct_GCE_ALL',
    'MathGr3-8_pTarget_PctMet',
    'lea_sat_avg_score_num',
    'lea_federal_perpupil_num',
    'lea_local_perpupil_num',
    'nbpts_num',
    'Distinguished_TCHR_Standard 2_Pct',
    '_1yr_tchr_trnovr_pct',
    'lateral_teach_pct',
    '0-3 Years_LEA_Exp_Pct_Prin',
    'lea_flicensed_teach_pct',
    'lea_tchyrs_4thru10_pct',
    'lea_tchyrs_11plus_pct',
    'lea_nbpts_num',
    'lea_advance_dgr_pct',
    'lea_1yr_tchr_trnovr_pct',
    'lea_emer_prov_teach_pct',
    'st_flicensed_teach_pct',
    'st_tchyrs_0thru3_pct',
    'st_1yr_tchr_trnovr_pct',
    'lea_tchyrs_0thru3_pct',
    # 'Category_Cd_T'
]

In [4]:
# data clean up

# schools2['category_cd'].unique()
df_master['category_cd_modified'] = np.select(
    [
        df_master['category_cd'] == 'A', 
        df_master['category_cd'] == 'E',
        df_master['category_cd'] == 'H',
        df_master['category_cd'] == 'I',
        df_master['category_cd'] == 'M'
    ], 
    [
        'Elem./Mid./High Together', 
        'Elementary School',
        'High School',
        'Elem./Mid. Together',
        'Middle School'
    ],
    default='Mid./High Together'
)

combo = df_master['category_cd_modified'].str.contains('/', regex=False)

df_master['category_cd_modified'] = np.where(combo, 'Combo', df_master['category_cd_modified'])

df_master['MinorityOverallPct'] = df_master['MinorityMalePct'] + df_master['MinorityFemalePct']

df_master['Majority_Minority'] = np.where(df_master['MinorityOverallPct'] > .5, 1,0)

df_final = df_master[df_master["school_type_txt"] == 'Regular School']


combo = df_final['SPG Grade'].str.contains('A+NG', regex=False)

df_final['SPG Grade'] = np.where(combo, 'A', df_final['SPG Grade'])

# df_final['SPG Grade'].value_counts()

df_final = df_final[df_final['SPG Grade'] != 'I']

df_final['SPG Grade'].value_counts()

df_master = df_final

df_master.info()

# pd.DataFrame(df_master["school_type_txt"].unique())

# pd.DataFrame(df_master['Majority_Minority'].value_counts())

# pd.DataFrame(df_master.category_cd_modified.value_counts())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 8675 entries, 0 to 9730
Columns: 262 entries, vphone_ad to Majority_Minority
dtypes: float64(238), int32(1), int64(2), object(21)
memory usage: 17.4+ MB


In [5]:
if 'SPG Score' in df_master:
    y = df_master['SPG Grade'].values
    X = df_master[df_headers].values

num_cv_iterations = 3
num_instances = len(y)
cv_object = ShuffleSplit(n_splits=num_cv_iterations,
                        test_size= 0.2)

print(cv_object)

ShuffleSplit(n_splits=3, random_state=None, test_size=0.2, train_size=None)


In [6]:
for train_indices, test_indices in cv_object.split(X,y):
    
    X_train = X[train_indices]
    y_train = y[train_indices]
    
    X_test = X[test_indices]
    y_test = y[test_indices]

# StandardScaler
scl_obj = StandardScaler()
scl_obj.fit(X_train)

X_train_scaled = scl_obj.transform(X_train)
X_test_scaled = scl_obj.transform(X_test)

In [7]:
model = LogisticRegression()

model.fit(X_train_scaled, y_train)
pred = model.predict(X_test_scaled)

accuracy_score(y_test, pred)

0.652449567723343

In [8]:
#Lasso
model2 = LogisticRegression(penalty='l1', solver='liblinear')

model2.fit(X_train_scaled, y_train)
pred = model2.predict(X_test_scaled)

accuracy_score(y_test, pred)

0.624207492795389

In [9]:
from sklearn.model_selection import GridSearchCV
# from sklearn.pipeline import Pipeline
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.feature_selection import 

# pipe = Pipeline([RandomForestClassifier(n_estimators=)])

param_grid = {'C': [0.1, 10, 100], 'penalty': ['l1','l2'], 'solver': ['liblinear']}
 

grid = GridSearchCV(LogisticRegression(), param_grid, refit=True, verbose=2)
grid.fit(X_train_scaled, y_train)

Fitting 5 folds for each of 6 candidates, totalling 30 fits
[CV] C=0.1, penalty=l1, solver=liblinear .............................
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
[CV] .............. C=0.1, penalty=l1, solver=liblinear, total=   0.3s
[CV] C=0.1, penalty=l1, solver=liblinear .............................
[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    0.2s remaining:    0.0s
[CV] .............. C=0.1, penalty=l1, solver=liblinear, total=   0.3s
[CV] C=0.1, penalty=l1, solver=liblinear .............................
[CV] .............. C=0.1, penalty=l1, solver=liblinear, total=   0.3s
[CV] C=0.1, penalty=l1, solver=liblinear .............................
[CV] .............. C=0.1, penalty=l1, solver=liblinear, total=   0.3s
[CV] C=0.1, penalty=l1, solver=liblinear .............................
[CV] .............. C=0.1, penalty=l1, solver=liblinear, total=   0.4s
[CV] C=0.1, penalty=l2, solver=liblinear ............................

GridSearchCV(cv=None, error_score=nan,
             estimator=LogisticRegression(C=1.0, class_weight=None, dual=False,
                                          fit_intercept=True,
                                          intercept_scaling=1, l1_ratio=None,
                                          max_iter=100, multi_class='auto',
                                          n_jobs=None, penalty='l2',
                                          random_state=None, solver='lbfgs',
                                          tol=0.0001, verbose=0,
                                          warm_start=False),
             iid='deprecated', n_jobs=None,
             param_grid={'C': [0.1, 10, 100], 'penalty': ['l1', 'l2'],
                         'solver': ['liblinear']},
             pre_dispatch='2*n_jobs', refit=True, return_train_score=False,
             scoring=None, verbose=2)

In [10]:
print(grid.best_params_)

{'C': 100, 'penalty': 'l1', 'solver': 'liblinear'}


In [11]:
from sklearn.metrics import classification_report, confusion_matrix

grid_predictions = grid.predict(X_test_scaled)
print(confusion_matrix(y_test, grid_predictions))
print(classification_report(y_test, grid_predictions))

[[ 47  45   6   1   0]
 [ 13 209 210   4   0]
 [  1  72 636  69   0]
 [  0   2 154 173   5]
 [  0   0   6  66  16]]
              precision    recall  f1-score   support

           A       0.77      0.47      0.59        99
           B       0.64      0.48      0.55       436
           C       0.63      0.82      0.71       778
           D       0.55      0.52      0.53       334
           F       0.76      0.18      0.29        88

    accuracy                           0.62      1735
   macro avg       0.67      0.49      0.53      1735
weighted avg       0.63      0.62      0.61      1735



In [12]:
from imblearn.over_sampling import SMOTE

X_train_resampled, y_train_resampled = SMOTE().fit_resample(X_train_scaled,y_train)

grid.fit(X_train_resampled, y_train_resampled)

grid_predictions2 = grid.predict(X_test)

pred_compare3 = pd.DataFrame({'preds': grid_predictions2, 'actual': y_test})[['preds', 'actual']]

acc = mt.accuracy_score(y_test, grid_predictions2)

print(acc)

pd.crosstab(pred_compare3['preds'], pred_compare3['actual'])

AttributeError: 'SMOTE' object has no attribute '_validate_data'