In [None]:
import pandas as pd
import yellowbrick as yb
import os
from yellowbrick.classifier import ConfusionMatrix
from sklearn.linear_model import LogisticRegression
import pandas
import matplotlib.pyplot as plt
from sklearn.model_selection import KFold
from sklearn.preprocessing import MinMaxScaler
from sklearn.svm import SVR
import numpy as np
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_val_predict
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from yellowbrick.classifier import ROCAUC
from sklearn.model_selection import train_test_split
import seaborn as sns
from imblearn.over_sampling import SMOTE
import warnings
from sklearn.model_selection import TimeSeriesSplit
from sklearn.naive_bayes import GaussianNB
from yellowbrick.model_selection import FeatureImportances
from yellowbrick.classifier import ClassificationReport
# Silence every warning for the rest of the session. Note that this also hides
# deprecation and convergence warnings raised by the libraries used below.
warnings.filterwarnings('ignore')
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Default-parameter estimators shared by the Naive Bayes and K-Nearest
# Neighbors cross-validation cells further down.
gnb = GaussianNB()
knn = KNeighborsClassifier()

In [None]:
# Display all columns: a max_columns of None removes the column display limit,
# so wide frames are shown without truncation.
pd.set_option('display.max_columns', None)

In [None]:
# Load the data set (path is relative to the working directory) and
# preview its first 5 instances.
df = pd.read_csv('higher_ed_summary.csv')
df.head()

In [None]:
# Number of closed schools: the sum of the closed_ind column
# (assumes closed_ind is a 0/1 indicator — TODO confirm).
df['closed_ind'].sum()

In [None]:
# (rows, columns) of the raw frame.
df.shape

In [None]:
# Enable inline rendering for the feature-vs-closed_ind plots drawn in later
# cells. This repeats the magic already issued in the setup cell.
%matplotlib inline

In [None]:
# Visualize the relationship between selected features and the target with
# scatterplots that include a fitted regression line.
feature_cols = ['avg_percent_ft_ug_aid', 'slope_grad_enrollment', 'sector']
sns.pairplot(
    df,
    x_vars=feature_cols,
    y_vars='closed_ind',
    height=4,
    aspect=0.7,
    kind='reg',
)

In [None]:
# Preview the first 5 rows of the raw frame again.
df.head()

In [None]:
# Keep only the instances (rows) that have at least 300 non-null values;
# thresh counts the non-NA entries required for a row to survive.
df1 = df.dropna(axis=0, thresh=300)
df1.head()

In [None]:
# (rows, columns) remaining after the sparse rows were dropped.
df1.shape

In [None]:
# Drop every feature (column) that still contains at least one null value.
df2 = df1.dropna(axis=1)
df2.head()

In [None]:
# (rows, columns) remaining after the columns with nulls were dropped.
df2.shape

In [None]:
# Split the cleaned frame into features and target: every column except
# closed_ind, instname and unitid is a feature; closed_ind is the target.
non_feature_cols = ['closed_ind', 'instname', 'unitid']
is_feature = ~df2.columns.isin(non_feature_cols)
X = df2.loc[:, is_feature]
y = df2['closed_ind']

In [None]:
# Rescale every feature column to the closed interval [0, 1].
# fit_transform returns a NumPy array, so X no longer carries column labels
# after this cell.
scaler = MinMaxScaler(feature_range=(0, 1))
X = scaler.fit_transform(X)

# Evaluate the models using k-fold cross-validation

In [None]:
from yellowbrick.target import ClassBalance

# X and y are intentionally NOT re-derived from df2 in this cell: doing so
# would throw away the MinMax-scaled feature matrix and pass unscaled data to
# the resampling and model-fitting cells that follow. The class-balance plot
# only needs the target vector y, which is already defined.

# Instantiate the visualizer
visualizer = ClassBalance(labels=["open", "closed"])

visualizer.fit(y)        # Count the instances of each class in the target
visualizer.show()        # Finalize and render the figure

In [None]:
# Per-class instance counts, showing the imbalance in the target.
y.value_counts()

In [None]:
# Oversample with SMOTE to address the class imbalance.
# fit_resample is the supported method name (fit_sample was deprecated and
# later removed from imbalanced-learn); the fixed seed makes the synthetic
# samples reproducible from run to run.
smt = SMOTE(random_state=42)
X, y = smt.fit_resample(X, y)

In [None]:
# Plot the class distribution of the current target vector.
balance_viz = ClassBalance(labels=["open", "closed"])
balance_viz.fit(y)
balance_viz.show()

In [None]:
# Per-class instance counts of the current target vector.
y.value_counts()

In [None]:
# Hold out 20% of the data for testing. The fixed seed keeps the split, and
# every metric computed from it, reproducible across kernel restarts.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Logistic Regression

In [None]:
# 5-fold cross-validated scores for logistic regression.
# The multi_class argument is omitted: it is deprecated in recent
# scikit-learn releases, and with solver='liblinear' the default already
# resolves to the same one-vs-rest scheme.
cross_val_score(LogisticRegression(solver='liblinear'), X, y, cv=5)

In [None]:
# Out-of-fold predictions from 5-fold cross-validation for logistic regression.
# The multi_class argument is omitted: it is deprecated in recent
# scikit-learn releases, and with solver='liblinear' the default already
# resolves to the same one-vs-rest scheme.
cross_val_predict(LogisticRegression(solver='liblinear'), X, y, cv=5)

In [None]:

# Confusion matrix for a default logistic regression: fit on the training
# split, scored on the test split.
lr_matrix = ConfusionMatrix(LogisticRegression())
lr_matrix.fit(X_train, y_train)
lr_matrix.score(X_test, y_test)
lr_matrix.show()

In [None]:
# Rebuild the features and target from the cleaned frame.
X = df2.loc[:, ~df2.columns.isin(['closed_ind','instname','unitid'])]
y = df2.loc[:, 'closed_ind']
# Stratify on y so both splits keep the original class proportions (the
# minority class could otherwise be missing from the test split), and fix the
# seed so the ROC curve is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)
# ROC/AUC plot for a default logistic regression.
oz = ROCAUC(LogisticRegression())
oz.fit(X_train, y_train)
oz.score(X_test, y_test)
oz.show()

In [None]:
# Rebuild the feature matrix and target from the cleaned frame.
feature_mask = ~df2.columns.isin(['closed_ind','instname','unitid'])
X = df2.loc[:, feature_mask]
y = df2['closed_ind']

# Display names for the target classes.
classes = ["open", "closed"]

# Train/test partition: the last fold produced by TimeSeriesSplit. Earlier
# folds are generated and discarded, which is what overwriting the split on
# every loop iteration amounts to.
tscv = TimeSeriesSplit()
train_idx, test_idx = list(tscv.split(X))[-1]
X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

# Classification report (with per-class support) for logistic regression.
model = LogisticRegression()
visualizer = ClassificationReport(model, classes=classes, support=True)
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
visualizer.show()

# Support Vector Machine

In [None]:
# 10-fold cross-validated scores for an SVC with gamma='auto'.
svc_cv_scores = cross_val_score(SVC(gamma='auto'), X, y, cv=10)
svc_cv_scores


In [None]:
# Out-of-fold predictions from 10-fold cross-validation for the same SVC setup.
svc_cv_preds = cross_val_predict(SVC(gamma='auto'), X, y, cv=10)
svc_cv_preds

In [None]:
# Confusion matrix for a default SVC, using whichever train/test split is
# currently held in X_train / X_test / y_train / y_test.
svc_matrix = ConfusionMatrix(SVC())
svc_matrix.fit(X_train, y_train)
svc_matrix.score(X_test, y_test)
svc_matrix.show()

In [None]:
# Rebuild the feature matrix and target from the cleaned frame.
feature_mask = ~df2.columns.isin(['closed_ind','instname','unitid'])
X = df2.loc[:, feature_mask]
y = df2['closed_ind']

# Display names for the target classes.
classes = ["open", "closed"]

# Train/test partition: the last fold produced by TimeSeriesSplit. Earlier
# folds are generated and discarded, which is what overwriting the split on
# every loop iteration amounts to.
tscv = TimeSeriesSplit()
train_idx, test_idx = list(tscv.split(X))[-1]
X_train, y_train = X.iloc[train_idx], y.iloc[train_idx]
X_test, y_test = X.iloc[test_idx], y.iloc[test_idx]

# Classification report (with per-class support) for a default SVC.
model = SVC()
visualizer = ClassificationReport(model, classes=classes, support=True)
visualizer.fit(X_train, y_train)
visualizer.score(X_test, y_test)
visualizer.show()

# Random Forest Classifier 

In [None]:
# 10-fold cross-validated scores for a 30-tree random forest.
# The forest is seeded so the scores are reproducible across runs.
cross_val_score(RandomForestClassifier(n_estimators=30, random_state=42), X, y, cv=10)

In [None]:
# Out-of-fold predictions from 10-fold cross-validation for a 30-tree forest.
# The forest is seeded so the predictions are reproducible across runs.
cross_val_predict(RandomForestClassifier(n_estimators=30, random_state=42), X, y, cv=10)

In [None]:
# Confusion matrix for a random forest, using whichever train/test split is
# currently held in X_train / X_test / y_train / y_test. The forest is seeded
# so the matrix is reproducible across runs.
viz = ConfusionMatrix(RandomForestClassifier(random_state=42))
viz.fit(X_train, y_train)
viz.score(X_test, y_test)
viz.show()

In [None]:
# Stratify on y so both splits keep the original class proportions, and seed
# both the split and the forest so the ROC curve is reproducible.
X_train, X_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=42)
# ROC/AUC plot for a random forest.
oz = ROCAUC(RandomForestClassifier(random_state=42))
oz.fit(X_train, y_train)
oz.score(X_test, y_test)
oz.show()

In [None]:

# Rebuild the features and target from the cleaned frame.
X = df2.loc[:, ~df2.columns.isin(['closed_ind','instname','unitid'])]
y = df2.loc[:, 'closed_ind']

# Specify the target classes
classes = ["open", "closed"]

# Create the training and test data. Each iteration overwrites the previous
# split, so only the last TimeSeriesSplit fold is used below.
tscv = TimeSeriesSplit()
for train_index, test_index in tscv.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

# Instantiate the classification model and visualizer.
# The forest is seeded so the report is reproducible across runs.
model = RandomForestClassifier(random_state=42)
visualizer = ClassificationReport(model, classes=classes, support=True)

visualizer.fit(X_train, y_train)        # Fit the visualizer and the model
visualizer.score(X_test, y_test)        # Evaluate the model on the test data
visualizer.show()                       # Finalize and show the figure

# Naive Bayes 

In [None]:
# 10-fold cross-validated scores for the shared Gaussian Naive Bayes
# estimator, run in a single process.
nb_cv_scores = cross_val_score(gnb, X, y, cv=10, n_jobs=1)
nb_cv_scores

In [None]:
# Out-of-fold predictions from 5-fold cross-validation for the shared
# Gaussian Naive Bayes estimator, run in a single process.
nb_cv_preds = cross_val_predict(gnb, X, y, cv=5, n_jobs=1)
nb_cv_preds

In [None]:
# Rebuild the features and target from the cleaned frame.
X = df2.loc[:, ~df2.columns.isin(['closed_ind','instname','unitid'])]
y = df2.loc[:, 'closed_ind']

# Specify the target classes
classes = ["open", "closed"]

# Create the training and test data. Each iteration overwrites the previous
# split, so only the last TimeSeriesSplit fold is used below.
tscv = TimeSeriesSplit()
for train_index, test_index in tscv.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

# Instantiate the classification model and visualizer.
# The report wraps GaussianNB so that it evaluates the same kind of model as
# the Naive Bayes cross-validation cells in this section (a
# RandomForestClassifier here would just repeat the random forest report).
model = GaussianNB()
visualizer = ClassificationReport(model, classes=classes, support=True)

visualizer.fit(X_train, y_train)        # Fit the visualizer and the model
visualizer.score(X_test, y_test)        # Evaluate the model on the test data
visualizer.show()                       # Finalize and show the figure

# K-Nearest Neighbors

In [None]:
# 10-fold cross-validated scores for the shared K-Nearest Neighbors estimator.
knn_cv_scores = cross_val_score(knn, X, y, cv=10)
knn_cv_scores

In [None]:
# Out-of-fold predictions from 10-fold cross-validation for the shared
# K-Nearest Neighbors estimator.
knn_cv_preds = cross_val_predict(knn, X, y, cv=10)
knn_cv_preds

In [None]:
# Rebuild the features and target from the cleaned frame.
X = df2.loc[:, ~df2.columns.isin(['closed_ind','instname','unitid'])]
y = df2.loc[:, 'closed_ind']

# Specify the target classes
classes = ["open", "closed"]

# Create the training and test data. Each iteration overwrites the previous
# split, so only the last TimeSeriesSplit fold is used below.
tscv = TimeSeriesSplit()
for train_index, test_index in tscv.split(X):
    X_train, X_test = X.iloc[train_index], X.iloc[test_index]
    y_train, y_test = y.iloc[train_index], y.iloc[test_index]

# Instantiate the classification model and visualizer.
# The report wraps KNeighborsClassifier so that it evaluates the same kind of
# model as the K-Nearest Neighbors cross-validation cells in this section
# (GaussianNB belongs to the Naive Bayes section).
model = KNeighborsClassifier()
visualizer = ClassificationReport(model, classes=classes, support=True)

visualizer.fit(X_train, y_train)        # Fit the visualizer and the model
visualizer.score(X_test, y_test)        # Evaluate the model on the test data
visualizer.show()                       # Finalize and show the figure

In [None]:
# Select the features and target from the cleaned frame (no data is loaded
# here; df2 is already in memory).
X = df2.loc[:, ~df2.columns.isin(['closed_ind','instname','unitid'])]
y = df2.loc[:, 'closed_ind']
# Rank features by the importances of a 10-tree random forest. The forest is
# seeded so the ranking does not change from run to run.
model = RandomForestClassifier(n_estimators=10, random_state=42)
viz = FeatureImportances(model)
viz.fit(X, y)
viz.show()
