In [None]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, export_graphviz
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report, plot_confusion_matrix
from sklearn.model_selection import GridSearchCV

In [None]:
# Load dataset.csv (path is relative to the notebook's working directory)
# and preview the first rows.
temp_df = pd.read_csv('dataset.csv')
temp_df.head()

In [None]:
# Column dtypes of the loaded frame.
temp_df.dtypes

In [None]:
# Checking for NULL values: number of missing entries per column.
temp_df.isnull().sum()

In [None]:
# Distinct label values present in the Target column.
temp_df.Target.unique()

In [None]:
# Drop, in place, every row whose Target is "Enrolled".
enrolled_rows = temp_df.index[temp_df["Target"] == "Enrolled"]
temp_df.drop(index=enrolled_rows, inplace=True)

In [None]:
# Encode the Target label as an integer class id for the classifiers.
# Graduate -> 1, Dropout -> 2 ("Enrolled" rows are dropped in the previous cell).

TARGET_CODES = {'Graduate': 1, 'Dropout': 2}


def f(s):
    """Return the integer class id for a ``Target`` value.

    'Graduate' maps to 1 and 'Dropout' maps to 2. A value that already equals
    one of those ids is returned unchanged, so applying ``f`` to a column that
    has already been encoded (for example when the cell is re-run) does not
    replace the whole column with ``None``. Any other value yields ``None``.
    """
    code = TARGET_CODES.get(s)
    if code is None and s in TARGET_CODES.values():
        # Already encoded: pass the id through instead of discarding it.
        return s
    return code
    
# Replace each Target value with the result of f, then preview the frame.
temp_df["Target"] = temp_df["Target"].apply(f)
temp_df.head()

In [None]:
# (rows, columns) of the frame at this point in the notebook.
temp_df.shape

In [None]:
# Split the frame by column position: the first 34 columns become the
# feature matrix, the column at position 34 becomes the label vector.
# NOTE(review): presumably position 34 is the Target column; confirm against the CSV.
n_features = 34
X = temp_df.iloc[:, :n_features]
Y = temp_df.iloc[:, n_features]

In [None]:
# Shapes of the feature matrix and the label vector.
X.shape, Y.shape

In [None]:
# Hold out a test set using train_test_split's default split size and a
# fixed seed, then show the shapes of the four resulting pieces.
split_parts = train_test_split(X, Y, random_state=1)
X_train, X_test, Y_train, Y_test = split_parts
(
    X_train.shape,
    X_test.shape,
    Y_train.shape,
    Y_test.shape,
)

In [None]:
# Logistic Regression
# Baseline model with default settings apart from a very large iteration cap.
logistic_regression = LogisticRegression(max_iter=1000000).fit(X_train, Y_train)

# Predictions on both splits; the test predictions are reused by the report cell.
logistic_regression_train_prediction = logistic_regression.predict(X_train)
logistic_regression_test_prediction = logistic_regression.predict(X_test)

# Cell output: (train confusion matrix, test confusion matrix).
(
    confusion_matrix(Y_train, logistic_regression_train_prediction),
    confusion_matrix(Y_test, logistic_regression_test_prediction),
)

In [None]:
# Test-set confusion matrix for the baseline logistic regression, followed by
# the per-class precision / recall / F1 report.
# All titles, labels and ticks are drawn in white (presumably for a dark theme).
# NOTE(review): plot_confusion_matrix was removed in scikit-learn 1.2;
# ConfusionMatrixDisplay.from_estimator is its replacement.
color = 'white'
matrix = plot_confusion_matrix(
    logistic_regression,
    X_test,
    Y_test,
    cmap=plt.cm.Blues,
)
matrix.ax_.set_title('Confusion Matrix', color=color)
plt.xlabel('Predicted Label', color=color)
plt.ylabel('True Label', color=color)

# Recolour the ticks on the first two axes of the current figure.
figure_axes = plt.gcf().axes
figure_axes[0].tick_params(colors=color)
figure_axes[1].tick_params(colors=color)
plt.show()

print(classification_report(Y_test, logistic_regression_test_prediction))

In [None]:
# Logistic regression with fixed hyper-parameters.
# NOTE(review): presumably the best_estimator_ settings reported by the
# logistic-regression grid search further down; confirm they are still current.
grid_logistic_regression = LogisticRegression(C=2, max_iter=500, solver='saga').fit(X_train, Y_train)

grid_logistic_regression_train_prediction = grid_logistic_regression.predict(X_train)
grid_logistic_regression_test_prediction = grid_logistic_regression.predict(X_test)

# Cell output: (train confusion matrix, test confusion matrix).
(
    confusion_matrix(Y_train, grid_logistic_regression_train_prediction),
    confusion_matrix(Y_test, grid_logistic_regression_test_prediction),
)

In [None]:
# Test-set confusion matrix for the tuned logistic regression, followed by
# the per-class classification report. Text and ticks are drawn in white.
color = 'white'
matrix = plot_confusion_matrix(
    grid_logistic_regression,
    X_test,
    Y_test,
    cmap=plt.cm.Blues,
)
matrix.ax_.set_title('Confusion Matrix', color=color)
plt.xlabel('Predicted Label', color=color)
plt.ylabel('True Label', color=color)

# Recolour the ticks on the first two axes of the current figure.
figure_axes = plt.gcf().axes
figure_axes[0].tick_params(colors=color)
figure_axes[1].tick_params(colors=color)
plt.show()

print(classification_report(Y_test, grid_logistic_regression_test_prediction))

In [None]:
# Decision Tree
# Baseline tree with default settings. random_state is pinned to the same
# seed the other decision-tree cells in this notebook use, so the fitted tree,
# its metrics and the exported PDF are identical on every Restart & Run All.
decision_tree = DecisionTreeClassifier(random_state=1024)
decision_tree.fit(X_train, Y_train)

# Predictions on both splits; the test predictions are reused by the report cell.
decision_tree_train_prediction = decision_tree.predict(X_train)
decision_tree_test_prediction = decision_tree.predict(X_test)

# Cell output: (train confusion matrix, test confusion matrix).
(
    confusion_matrix(Y_train, decision_tree_train_prediction),
    confusion_matrix(Y_test, decision_tree_test_prediction),
)

In [None]:
# Test-set confusion matrix for the baseline decision tree, followed by the
# per-class classification report. Text and ticks are drawn in white.
color = 'white'
matrix = plot_confusion_matrix(
    decision_tree,
    X_test,
    Y_test,
    cmap=plt.cm.Blues,
)
matrix.ax_.set_title('Confusion Matrix', color=color)
plt.xlabel('Predicted Label', color=color)
plt.ylabel('True Label', color=color)

# Recolour the ticks on the first two axes of the current figure.
figure_axes = plt.gcf().axes
figure_axes[0].tick_params(colors=color)
figure_axes[1].tick_params(colors=color)
plt.show()

print(classification_report(Y_test, decision_tree_test_prediction))

In [None]:
# Depth-limited decision tree with a fixed seed.
# NOTE(review): presumably the best_estimator_ settings reported by the
# decision-tree grid search at the end of the notebook; confirm.
grid_decision_tree = DecisionTreeClassifier(max_depth=5, random_state=1024).fit(X_train, Y_train)

grid_decision_tree_train_prediction = grid_decision_tree.predict(X_train)
grid_decision_tree_test_prediction = grid_decision_tree.predict(X_test)

# Cell output: (train confusion matrix, test confusion matrix).
(
    confusion_matrix(Y_train, grid_decision_tree_train_prediction),
    confusion_matrix(Y_test, grid_decision_tree_test_prediction),
)

In [None]:
# Test-set confusion matrix for the depth-limited decision tree, followed by
# the per-class classification report. Text and ticks are drawn in white.
color = 'white'
matrix = plot_confusion_matrix(
    grid_decision_tree,
    X_test,
    Y_test,
    cmap=plt.cm.Blues,
)
matrix.ax_.set_title('Confusion Matrix', color=color)
plt.xlabel('Predicted Label', color=color)
plt.ylabel('True Label', color=color)

# Recolour the ticks on the first two axes of the current figure.
figure_axes = plt.gcf().axes
figure_axes[0].tick_params(colors=color)
figure_axes[1].tick_params(colors=color)
plt.show()

print(classification_report(Y_test, grid_decision_tree_test_prediction))

In [None]:
# Display names for the 34 feature columns, passed to export_graphviz when the
# decision tree is rendered.
# NOTE(review): the order must match the column order of X (the first 34
# columns of the frame); confirm against the CSV header.
feature_names_arr = ['Marital status', 'Application mode', 'Application order',
       'Course', 'Daytime/evening attendance', 'Previous qualification',
       'Nationality', "Mother's qualification", "Father's qualification",
       "Mother's occupation", "Father's occupation", 'Displaced',
       'Educational special needs', 'Debtor', 'Tuition fees up to date',
       'Gender', 'Scholarship holder', 'Age at enrollment',
       'International', 'Curricular units 1st sem (credited)',
       'Curricular units 1st sem (enrolled)',
       'Curricular units 1st sem (evaluations)',
       'Curricular units 1st sem (approved)',
       'Curricular units 1st sem (grade)',
       'Curricular units 1st sem (without evaluations)',
       'Curricular units 2nd sem (credited)',
       'Curricular units 2nd sem (enrolled)',
       'Curricular units 2nd sem (evaluations)',
       'Curricular units 2nd sem (approved)',
       'Curricular units 2nd sem (grade)',
       'Curricular units 2nd sem (without evaluations)',
       'Unemployment rate', 'Inflation rate', 'GDP']

In [None]:
import pydotplus

# Render the baseline decision tree to a PDF.
# export_graphviz pairs class_names with the fitted classes in ascending
# order. Target is encoded as 1 = Graduate and 2 = Dropout, and the
# "Enrolled" rows are dropped before training, so there are exactly two
# classes and 'Graduate' must come first.
dot_data = export_graphviz(
    decision_tree,
    out_file=None,  # return the DOT source as a string instead of writing a file
    feature_names=feature_names_arr,
    class_names=['Graduate', 'Dropout'],
)
graph = pydotplus.graph_from_dot_data(dot_data)  # build a pydotplus graph from the DOT source
graph.write_pdf('data_vis.pdf')

In [None]:
# KNN
# k-nearest-neighbours classifier with default settings.
knn_classifier = KNeighborsClassifier().fit(X_train, Y_train)

# Predictions on both splits; the test predictions are reused by the report cell.
knn_classifier_train_prediction = knn_classifier.predict(X_train)
knn_classifier_test_prediction = knn_classifier.predict(X_test)

# Cell output: (train confusion matrix, test confusion matrix).
(
    confusion_matrix(Y_train, knn_classifier_train_prediction),
    confusion_matrix(Y_test, knn_classifier_test_prediction),
)

In [None]:
# Test-set confusion matrix for the KNN classifier, followed by the per-class
# classification report. Text and ticks are drawn in white.
color = 'white'
matrix = plot_confusion_matrix(
    knn_classifier,
    X_test,
    Y_test,
    cmap=plt.cm.Blues,
)
matrix.ax_.set_title('Confusion Matrix', color=color)
plt.xlabel('Predicted Label', color=color)
plt.ylabel('True Label', color=color)

# Recolour the ticks on the first two axes of the current figure.
figure_axes = plt.gcf().axes
figure_axes[0].tick_params(colors=color)
figure_axes[1].tick_params(colors=color)
plt.show()

print(classification_report(Y_test, knn_classifier_test_prediction))

In [None]:
# Support Vector Classifier
# SVC with default settings.
svm_classifier = SVC().fit(X_train, Y_train)

# Predictions on both splits; the test predictions are reused by the report cell.
svm_classifier_train_prediction = svm_classifier.predict(X_train)
svm_classifier_test_prediction = svm_classifier.predict(X_test)

# Cell output: (train confusion matrix, test confusion matrix).
(
    confusion_matrix(Y_train, svm_classifier_train_prediction),
    confusion_matrix(Y_test, svm_classifier_test_prediction),
)

In [None]:
# Test-set confusion matrix for the default SVC, followed by the per-class
# classification report. Text and ticks are drawn in white.
color = 'white'
matrix = plot_confusion_matrix(
    svm_classifier,
    X_test,
    Y_test,
    cmap=plt.cm.Blues,
)
matrix.ax_.set_title('Confusion Matrix', color=color)
plt.xlabel('Predicted Label', color=color)
plt.ylabel('True Label', color=color)

# Recolour the ticks on the first two axes of the current figure.
figure_axes = plt.gcf().axes
figure_axes[0].tick_params(colors=color)
figure_axes[1].tick_params(colors=color)
plt.show()

print(classification_report(Y_test, svm_classifier_test_prediction))

In [None]:
# Grid Search Result
# SVC refit with fixed C and gamma.
# NOTE(review): presumably the values reported by the SVC grid search further
# down; confirm they are still current.
grid_svm_classifier = SVC(C=100, gamma=0.0001).fit(X_train, Y_train)

grid_svm_classifier_train_prediction = grid_svm_classifier.predict(X_train)
grid_svm_classifier_test_prediction = grid_svm_classifier.predict(X_test)

# Cell output: (train confusion matrix, test confusion matrix).
(
    confusion_matrix(Y_train, grid_svm_classifier_train_prediction),
    confusion_matrix(Y_test, grid_svm_classifier_test_prediction),
)

In [None]:
# Test-set confusion matrix for the tuned SVC, followed by the per-class
# classification report. Text and ticks are drawn in white.
color = 'white'
matrix = plot_confusion_matrix(
    grid_svm_classifier,
    X_test,
    Y_test,
    cmap=plt.cm.Blues,
)
matrix.ax_.set_title('Confusion Matrix', color=color)
plt.xlabel('Predicted Label', color=color)
plt.ylabel('True Label', color=color)

# Recolour the ticks on the first two axes of the current figure.
figure_axes = plt.gcf().axes
figure_axes[0].tick_params(colors=color)
figure_axes[1].tick_params(colors=color)
plt.show()

print(classification_report(Y_test, grid_svm_classifier_test_prediction))

In [None]:
# Grid Search
# Exhaustive search over 5 C values x 5 solvers x 4 iteration caps
# (100 parameter combinations) for logistic regression.
clf = LogisticRegression()
grid = {
    'C': [1, 2, 3, 4, 5],
    'solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'max_iter': [100, 300, 500, 700],
}
abc = GridSearchCV(clf, grid)
abc.fit(X_train, Y_train)

In [None]:
# Best estimator found by the logistic-regression grid search.
abc.best_estimator_

In [None]:
# Search over C and gamma for an RBF-kernel SVC (5 x 5 = 25 combinations).
param_grid = {
    'C': [0.1, 1, 10, 100, 1000],
    'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
    'kernel': ['rbf'],
}

# refit=True retrains the best configuration on all of X_train;
# verbose=3 prints progress for each fit.
# Note: this rebinds `grid`, which the logistic-regression search cell used
# for its parameter dictionary.
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=3)
grid.fit(X_train, Y_train)

In [None]:
# Best estimator found by the SVC grid search.
grid.best_estimator_

In [None]:
# Predict with the fitted logistic-regression grid search object (`abc`)
# on both splits.
grid_search_logistic_regression_train_prediction = abc.predict(X_train)
grid_search_logistic_regression_test_prediction = abc.predict(X_test)

# Cell output: (train confusion matrix, test confusion matrix).
(
    confusion_matrix(Y_train, grid_search_logistic_regression_train_prediction),
    confusion_matrix(Y_test, grid_search_logistic_regression_test_prediction),
)

In [None]:
# Decision-tree search: 2 split criteria x 18 depth limits, scored with
# 5-fold cross-validation, using a fixed seed for the tree.
param_grid = {
    'criterion': ['gini', 'entropy'],
    'max_depth': [4, 5, 6, 7, 8, 9, 10, 11, 12, 15, 20, 30, 40, 50, 70, 90, 120, 150],
}
tree_clas = DecisionTreeClassifier(random_state=1024)
grid_search = GridSearchCV(
    estimator=tree_clas,
    param_grid=param_grid,
    cv=5,
    verbose=True,
)
grid_search.fit(X_train, Y_train)

In [None]:
# Best estimator found by the decision-tree grid search.
grid_search.best_estimator_