Importing packages & reading data

In [None]:
# Core data-handling and plotting stack.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# scikit-learn: preprocessing, models, data splitting and evaluation helpers.
from sklearn import preprocessing
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn import tree
from sklearn.model_selection import train_test_split
# `metrics` is imported as a module because the logistic-regression cell calls
# `metrics.confusion_matrix`; the individual scorers are imported by name.
from sklearn import metrics
from sklearn.metrics import accuracy_score
from sklearn.metrics import mean_squared_error, mean_absolute_error, r2_score, classification_report

# Global plot styling.
# NOTE(review): sns.set() applies seaborn's own plotting context after this
# plt.rc call and may override the font size set here — confirm, and move the
# plt.rc line below sns.set() if size 14 is really wanted.
plt.rc("font", size=14)
sns.set(style="whitegrid", color_codes=True)

#READING DATA
# Both calls read the same CSV and treat '?' as a missing value. The second
# assignment replaces the first, so only the index_col='names' result is kept.
# NOTE(review): these look like two alternatives (index by first column position
# vs. by a column called 'names') — confirm which matches the CSV header and keep one.
data = pd.read_csv('data/College.csv',na_values=['?'],index_col=0)
data = pd.read_csv('data/College.csv',na_values=['?'],index_col='names')

High-level look at the data

In [None]:
#SCATTER MATRIX
# Grid of pairwise plots for the selected columns.
pd.plotting.scatter_matrix(data[['X1','X2','X3']])

#CORRELATION TABLE
# numeric_only=True restricts the computation to numeric columns.
data.corr(numeric_only=True)

#COUNTS PLOT
data['y'].value_counts()
# Give the count plot its own figure: without an explicit `ax`, seaborn draws on
# the current axes, which at this point is a panel of the scatter matrix above.
count_fig, count_ax = plt.subplots()
sns.countplot(x='y', data=data, ax=count_ax)
# Share of rows with y == 0 among rows where y is 0 or 1.
count_no_sub = len(data[data['y']==0])
count_sub = len(data[data['y']==1])
pct_of_no_sub = count_no_sub/(count_no_sub+count_sub)

#GROUPING NUMERICAL
# Per-age means of the numeric columns; numeric_only=True keeps text columns
# (e.g. 'job', 'education') from raising, matching the corr() call above.
data.groupby("age").mean(numeric_only=True)

#FREQUENCY PLOT
# Bar chart of the job-by-y contingency table.
pd.crosstab(data.job,data.y).plot(kind='bar')
plt.title('Purchase Frequency for Job Title')

Manipulating & Preparing Dataset

In [None]:
#JOIN DATA BY ADDING MORE COLUMNS
# axis=1 places the frames side by side (columns are appended, rows align on the index).
result = pd.concat([data1,data2],axis=1)

#BINNING DATA
# Two bins with edges 0, 50, 100, labelled 'No' and 'Yes'.
data['binned'] = pd.cut(data['feature'],bins=[0,50,100],labels=['No','Yes'])

#REPLACING STUFF
# Rewrite 'basic.4y' as 'Basic'; every other value is kept unchanged.
data['education']=np.where(data['education'] =='basic.4y', 'Basic', data['education'])

#DROPPING DATA
# drop() returns a new frame rather than modifying `data`, so the result must be
# assigned back for the column to actually be removed.
data = data.drop(columns=['droptitle'])

#DUMMY
dummy_columns = pd.get_dummies(data[['A','B','C']]) #A, B, C are categorical, become columns of true/false
#add the above to the existing numerical datasets

Linear Regression

In [None]:
feature_cols = ['feature1','feature2','feature3']
X = data[feature_cols]
Y = data['column3']

# Hold out 25% of the rows; the split is seeded so the fitted coefficients and
# scores below are the same on every run.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=13)

linreg = LinearRegression()
linreg.fit(X_train,Y_train)
Y_pred = linreg.predict(X_test)

intercept = linreg.intercept_
coeff = linreg.coef_

# Score over the whole dataset (training rows included).
linreg.score(X,Y) #Single Variable
# R^2 on the held-out rows only.
r2_score(Y_test,Y_pred)

# Scatter of the data with a red line built from the first coefficient and the intercept.
# NOTE(review): the line only represents the fitted model when it was trained on
# a single feature stored in column 'X' — confirm before reusing with several features.
plt.scatter(data['X'],data['Y'])
plt.plot(data['X'],coeff[0]*data['X']+intercept,color='red')

Linear Regression with Power Law

In [None]:
# Fit Y = k * X**a by regressing log(Y) on log(X): the slope is the exponent `a`
# and the intercept is log(k).
data['logX'] = np.log(data['X'])
data['logY'] = np.log(data['Y'])

loglinreg = LinearRegression()
loglinreg.fit(data[['logX']],data['logY'])
exponent = loglinreg.coef_       # one-element array: the model has a single feature
logk = loglinreg.intercept_

# Evaluate the fitted curve on sorted X values so the line is drawn left to right.
# The curve is computed from the sorted X (rather than sorting Y separately) so
# it stays correct when the exponent is negative and the curve is decreasing.
x_sorted = np.sort(data['X'])
plt.scatter(data['X'],data['Y'])
plt.plot(x_sorted, np.exp(logk)*np.power(x_sorted, exponent[0]))

Logistic Regression

In [None]:
feature_cols = ['feature1','feature2','feature3']
X = data[feature_cols]
Y = data['column3']

# Hold out 25% of the rows; seeded with the same value as the model so the
# confusion matrix and scores are reproducible.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=16)

logreg = LogisticRegression(random_state=16)
logreg.fit(X_train, Y_train)
Y_pred = logreg.predict(X_test)

# Confusion matrix of actual vs. predicted labels on the held-out rows.
cnf_matrix = metrics.confusion_matrix(Y_test, Y_pred)
# Mean accuracy on the held-out rows.
logreg.score(X_test,Y_test)
# The report is a multi-line string; print it so it is laid out as a table
# instead of being echoed as an escaped repr.
print(classification_report(Y_test, Y_pred))

Plotting LogReg Results: Confusion Matrix, predicted vs actual final class

In [None]:
class_names=['A','B'] # name  of classes
fig, ax = plt.subplots()

# Attach the class names as row/column labels so the heatmap uses them as tick
# labels; ticks set on the axes before the heatmap is drawn get replaced by it.
cnf_frame = pd.DataFrame(cnf_matrix, index=class_names, columns=class_names)
sns.heatmap(cnf_frame, annot=True, cmap="Blues_r", fmt='g', ax=ax)
ax.xaxis.set_label_position("top")
plt.title('Confusion matrix', y=1.1)
plt.ylabel('Actual')
plt.xlabel('Predicted')
# Run the layout last so it accounts for the title and axis labels.
plt.tight_layout()

Decision Tree Classifier

In [None]:
# Features are every column except the target column 'class'.
X = data.drop(['class'], axis = 1)
Y = data['class']
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size = 0.2, random_state = 13)

treeclass = DecisionTreeClassifier(random_state = 13)
treeclass.fit(X_train, Y_train)
Y_pred = treeclass.predict(X_test)

# accuracy_score takes the true labels first, then the predictions.
accuracy = accuracy_score(Y_test, Y_pred)

# FEATURE IMPORTANCE
# Use the feature frame's columns: there is one importance per training feature,
# whereas data.columns also contains the target column.
feature_names = list(X.columns)
sns.barplot(x=feature_names, y=treeclass.feature_importances_)
plt.xticks(rotation = 50)
plt.show()

# VISUALISE TREE
# Names come from the training frame and the fitted classifier; class labels are
# converted to strings for display.
fig = plt.figure(figsize = (10, 7))
tree.plot_tree(
    treeclass,
    feature_names = feature_names,
    class_names = [str(label) for label in treeclass.classes_],
    filled = True,
)
plt.show()