# Testing Classifiers on Massachusetts College Town data

Load a dataframe of different cities/towns with a count of their total colleges, population in 2000, fraction of the population possessing a bachelor's, master's or doctorate as their highest degree, and the per capita income in US Dollars.<br>
Drop any cities where any of this information was not available.

In [5]:
import pandas as pd
import random
import matplotlib.pyplot as plt
df = pd.read_csv(output_path, index_col=0).dropna()
df

Unnamed: 0_level_0,college_count,population_2000,bachelors_degree,masters_degree,doctorate_degree,per_capita_income
town,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
ABINGTON,0,14605,0.271,0.091,0.002,45174.0
ACTON,0,20331,0.332,0.283,0.080,65952.0
ACUSHNET,0,10161,0.165,0.033,0.003,39457.0
ADAMS,0,8809,0.165,0.057,0.007,34017.0
AGAWAM,0,28144,0.210,0.122,0.009,42754.0
...,...,...,...,...,...,...
WINTHROP,0,18303,0.276,0.094,0.014,49915.0
WOBURN,7,37258,0.279,0.156,0.022,50498.0
WORCESTER,15,172648,0.188,0.085,0.020,30855.0
WRENTHAM,0,10554,0.350,0.134,0.017,62721.0


We'll classify each city as a `college_town` or not if its `college_count` is non-zero. We'll also create a new column, `graduate_degree`, defined as the sum of `masters_degree` and `doctorate_degree`.

In [6]:
df["is_college_town"] = df["is_college_town"].astype("bool")
df["graduate_degree"] = df["masters_degree"] + df["doctorate_degree"]
df

Unnamed: 0_level_0,college_count,population_2000,bachelors_degree,masters_degree,doctorate_degree,per_capita_income,college_town,graduate_degree
town,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
ABINGTON,0,14605,0.271,0.091,0.002,45174.0,False,0.093
ACTON,0,20331,0.332,0.283,0.080,65952.0,False,0.363
ACUSHNET,0,10161,0.165,0.033,0.003,39457.0,False,0.036
ADAMS,0,8809,0.165,0.057,0.007,34017.0,False,0.064
AGAWAM,0,28144,0.210,0.122,0.009,42754.0,False,0.131
...,...,...,...,...,...,...,...,...
WINTHROP,0,18303,0.276,0.094,0.014,49915.0,False,0.108
WOBURN,7,37258,0.279,0.156,0.022,50498.0,True,0.178
WORCESTER,15,172648,0.188,0.085,0.020,30855.0,True,0.105
WRENTHAM,0,10554,0.350,0.134,0.017,62721.0,False,0.151


## Decision Tree Classifier

In [10]:
from sklearn.tree import DecisionTreeClassifier, export_text
from sklearn.model_selection import train_test_split
import sklearn.metrics

cols = ["per_capita_income", "population_2000", "graduate_degree"]
X = df[cols].to_numpy()
y = df["college_town"].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X, y, train_size=0.7, random_state=1)

def generate_and_test_decision_tree(max_depth, min_samples_leaf):
    decisionTree = DecisionTreeClassifier(max_depth = max_depth, min_samples_leaf = min_samples_leaf).fit(X_train, y_train)
    print(f"\nOverall performance for max_depth {max_depth}, min_samples_leaf {min_samples_leaf}:")
    train_score = decisionTree.score(X_train, y_train)
    test_score = decisionTree.score(X_test, y_test)
    print("--Predicting training data values:", round(train_score,3))
    print("--Predicting test data values:", round(test_score,3))
    return decisionTree
    
dT = generate_and_test_decision_tree(4, 3)
y_predict = dT.predict(X_test)

confusion = sklearn.metrics.confusion_matrix(y_test, y_predict)
print("\nConfusion Matrix:\n",confusion,"\n")

textrender = export_text(dT, feature_names=cols)
print(textrender)


Overall performance for max_depth 4, min_samples_leaf 3:
--Predicting training data values: 0.813
--Predicting test data values: 0.8

Confusion Matrix:
 [[60 10]
 [ 9 16]] 

|--- population_2000 <= 22850.00
|   |--- population_2000 <= 7480.50
|   |   |--- graduate_degree <= 0.22
|   |   |   |--- graduate_degree <= 0.18
|   |   |   |   |--- class: False
|   |   |   |--- graduate_degree >  0.18
|   |   |   |   |--- class: False
|   |   |--- graduate_degree >  0.22
|   |   |   |--- graduate_degree <= 0.23
|   |   |   |   |--- class: True
|   |   |   |--- graduate_degree >  0.23
|   |   |   |   |--- class: False
|   |--- population_2000 >  7480.50
|   |   |--- graduate_degree <= 0.37
|   |   |   |--- per_capita_income <= 63450.50
|   |   |   |   |--- class: False
|   |   |   |--- per_capita_income >  63450.50
|   |   |   |   |--- class: False
|   |   |--- graduate_degree >  0.37
|   |   |   |--- class: True
|--- population_2000 >  22850.00
|   |--- population_2000 <= 54320.50
|   |   |---

## Logistic Regression

In [20]:
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
import sklearn.metrics

scaler = MinMaxScaler()

cols = ["per_capita_income", "population_2000", "graduate_degree"]
X = df[cols].to_numpy()
X_scale = scaler.fit_transform(X)
y = df["college_town"].to_numpy()
X_train, X_test, y_train, y_test = train_test_split(X_scale, y, train_size=0.3, random_state=10)

logisticReg = LogisticRegression(random_state=3).fit(X_train, y_train)
y_predict = logisticReg.predict(X_test)
y_predict_probability = logisticReg.predict_proba(X_test)[::,1]
confusion = sklearn.metrics.confusion_matrix(y_test, y_predict)
print("Confusion Matrix:")
print(confusion)

report = sklearn.metrics.classification_report(y_test, y_predict, target_names=["No Colleges", "College Town"])
print(report)

false_pos_rate, true_pos_rate, _ = sklearn.metrics.roc_curve(y_test, y_predict_probability)
auc = sklearn.metrics.roc_auc_score(y_test, y_predict_probability)
plt.plot(false_pos_rate, true_pos_rate,label=f"AUC:{auc:.3f}")
plt.xlabel("False Positive Rate")
plt.ylabel("True Positive rate")
plt.legend()
plt.show()

Confusion Matrix:
[[152   0]
 [ 68   0]]


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


              precision    recall  f1-score   support

 No Colleges       0.69      1.00      0.82       152
College Town       0.00      0.00      0.00        68

    accuracy                           0.69       220
   macro avg       0.35      0.50      0.41       220
weighted avg       0.48      0.69      0.56       220

