## Machine Learning

In [62]:
import pandas as pd
import numpy as np

import warnings
warnings.filterwarnings('ignore')

In [63]:
df = pd.read_csv('HousingData.csv')

In [64]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 506 entries, 0 to 505
Data columns (total 14 columns):
CRIM       486 non-null float64
ZN         486 non-null float64
INDUS      486 non-null float64
CHAS       486 non-null float64
NOX        506 non-null float64
RM         506 non-null float64
AGE        486 non-null float64
DIS        506 non-null float64
RAD        506 non-null int64
TAX        506 non-null int64
PTRATIO    506 non-null float64
B          506 non-null float64
LSTAT      486 non-null float64
MEDV       506 non-null float64
dtypes: float64(12), int64(2)
memory usage: 55.4 KB


In [65]:
housing_df = df.dropna(how='any',axis=0) 

In [66]:
housing_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 394 entries, 0 to 504
Data columns (total 14 columns):
CRIM       394 non-null float64
ZN         394 non-null float64
INDUS      394 non-null float64
CHAS       394 non-null float64
NOX        394 non-null float64
RM         394 non-null float64
AGE        394 non-null float64
DIS        394 non-null float64
RAD        394 non-null int64
TAX        394 non-null int64
PTRATIO    394 non-null float64
B          394 non-null float64
LSTAT      394 non-null float64
MEDV       394 non-null float64
dtypes: float64(12), int64(2)
memory usage: 46.2 KB


In [67]:
X = housing_df.iloc[:,:-1]
y = housing_df.iloc[:, -1]

In [68]:
# Import necessary modules
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

# Create training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=22)

# Create the regressor: reg_all
reg_all = LinearRegression()

# Fit the regressor to the training data
reg_all.fit(X_train, y_train)

# Predict on the test data: y_pred
y_pred = reg_all.predict(X_test)

# Compute and print R^2 and RMSE
#print("R^2: {}".format(reg_all.score(X_test, y_test)))
rmse = np.sqrt(mean_squared_error(y_test, y_pred))
print("Root Mean Squared Error: {}".format(rmse))

Root Mean Squared Error: 3.799844098529469


In [69]:
def regression_model(model):

    # Create training and test sets
    X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
    
    # Create the regressor: reg_all
    reg = model

    # Fit the regressor to the training data
    reg.fit(X_train, y_train)

    # Predict on the test data: y_pred
    y_pred = reg.predict(X_test)

    # Compute and print RMSE
    rmse = np.sqrt(mean_squared_error(y_test, y_pred))
    print("Root Mean Squared Error: {}".format(rmse))

In [70]:
regression_model(LinearRegression())

Root Mean Squared Error: 5.600227046849506


In [71]:
regression_model(LinearRegression())

Root Mean Squared Error: 5.422840675009316


In [72]:
regression_model(LinearRegression())

Root Mean Squared Error: 4.1409513362203825


In [73]:
from sklearn.model_selection import cross_val_score

In [74]:
def regression_model_cv(model, k=5):

    # Create the regressor: reg_all
    reg = model

    scores = cross_val_score(reg, X, y, scoring='neg_mean_squared_error', cv=k)

    rmse = np.sqrt(-scores)
    print('Reg rmse:', rmse)
    print('Reg mean:', rmse.mean())

In [75]:
regression_model_cv(LinearRegression())

Reg rmse: [3.26123843 4.42712448 5.66151114 8.09493087 5.24453989]
Reg mean: 5.33786896287833


In [76]:
regression_model_cv(LinearRegression(), k=3)

Reg rmse: [ 3.72504914  6.01655701 23.20863933]
Reg mean: 10.983415161090798


In [77]:
regression_model_cv(LinearRegression(), k=6)

Reg rmse: [3.23879491 3.97041949 5.58329663 3.92861033 9.88399671 3.91442679]
Reg mean: 5.0865908108011


In [78]:
from sklearn.linear_model import Ridge

regression_model_cv(Ridge())

Reg rmse: [3.17202127 4.54972372 5.36604368 8.03715216 5.03988501]
Reg mean: 5.232965166251769


In [79]:
from sklearn.linear_model import Lasso

regression_model_cv(Lasso())

Reg rmse: [3.52318747 5.70083491 7.82318757 6.9878025  3.97229348]
Reg mean: 5.60146118538429


### Additional Models

In [80]:
from sklearn.neighbors import KNeighborsRegressor

regression_model_cv(KNeighborsRegressor())

Reg rmse: [ 8.24568226  8.81322798 10.58043836  8.85643441  5.98100069]
Reg mean: 8.495356738515685


In [81]:
regression_model_cv(KNeighborsRegressor(n_neighbors=4))

Reg rmse: [ 8.44659788  8.99814547 10.97170231  8.86647969  5.72114135]
Reg mean: 8.600813339223432


In [82]:
regression_model_cv(KNeighborsRegressor(n_neighbors=7))

Reg rmse: [ 7.99710601  8.68309183 10.66332898  8.90261573  5.51032355]
Reg mean: 8.351293217401393


In [83]:
regression_model_cv(KNeighborsRegressor(n_neighbors=10))

Reg rmse: [ 7.47549287  8.62914556 10.69543822  8.91330686  6.52982222]
Reg mean: 8.448641147609868


In [84]:
from sklearn.model_selection import GridSearchCV

# Setup the hyperparameter grid of 1-20 neighbors
neighbors = np.linspace(1, 20, 20)
# Convert floats to int (required by knn)
k = neighbors.astype(int)
# Setup the hyperparameter grid
param_grid = {'n_neighbors': k}

# Instantiate the knn regressor
knn = KNeighborsRegressor()
# Instantiate the GridSearchCV object: knn_tuned
knn_tuned = GridSearchCV(knn, param_grid, cv=5, scoring='neg_mean_squared_error')
# Fit knn_tuned to the data
knn_tuned.fit(X, y)
    
# Print the tuned parameters and score
k = knn_tuned.best_params_
print("Best n_neighbors: {}".format(k))
score = knn_tuned.best_score_
rsm = np.sqrt(-score)
print("Best score: {}".format(rsm))

Best n_neighbors: {'n_neighbors': 7}
Best score: 8.523048500643897


In [85]:
from sklearn import tree
regression_model_cv(tree.DecisionTreeRegressor(random_state=22))

Reg rmse: [3.97380664 6.09699033 8.43734165 6.47179861 5.42085243]
Reg mean: 6.080157933019333


In [86]:
from sklearn.ensemble import RandomForestRegressor
regression_model_cv(RandomForestRegressor(random_state=22))

Reg rmse: [3.59863933 4.3893036  4.42057961 6.66756925 4.35756519]
Reg mean: 4.68673139580543


In [87]:
regression_model_cv(RandomForestRegressor(n_jobs=-1, n_estimators=100, random_state=22))

Reg rmse: [3.14403083 3.80165678 4.72810099 6.47428712 4.14026328]
Reg mean: 4.457667798068795


In [88]:
from sklearn.model_selection import RandomizedSearchCV

# Setup the hyperparameter grid
param_grid = {'max_depth': [None, 10, 30, 50, 70, 100, 200, 400],
             'min_samples_split': [2, 3, 4, 5],
             'min_samples_leaf': [1, 2, 3],
             'max_features': ['auto', 'sqrt']}

# Instantiate the knn regressor
reg = RandomForestRegressor(n_jobs = -1, n_estimators=100, random_state=22)
# Instantiate the GridSearchCV object: knn_tuned
reg_tuned = RandomizedSearchCV(reg, param_grid, cv=5, scoring='neg_mean_squared_error', random_state=22)
# Fit knn_tuned to the data
reg_tuned.fit(X, y)
    
# Print the tuned parameters and score
k = reg_tuned.best_params_
print("Best n_neighbors: {}".format(k))
score = reg_tuned.best_score_
rsm = np.sqrt(-score)
print("Best score: {}".format(rsm))

Best n_neighbors: {'min_samples_split': 2, 'min_samples_leaf': 3, 'max_features': 'auto', 'max_depth': 10}
Best score: 4.56038417117328


In [89]:
regression_model_cv(RandomForestRegressor(n_jobs=-1, n_estimators=500, random_state=22))

Reg rmse: [3.17047996 3.70803725 4.90857097 6.52090291 4.06240733]
Reg mean: 4.474079685805386


### Logistic Regression

In [90]:
df = pd.read_csv('HTRU_2.csv')

In [91]:
df.head()

Unnamed: 0,140.5625,55.68378214,-0.234571412,-0.699648398,3.199832776,19.11042633,7.975531794,74.24222492,0
0,102.507812,58.88243,0.465318,-0.515088,1.677258,14.860146,10.576487,127.39358,0
1,103.015625,39.341649,0.323328,1.051164,3.121237,21.744669,7.735822,63.171909,0
2,136.75,57.178449,-0.068415,-0.636238,3.642977,20.95928,6.896499,53.593661,0
3,88.726562,40.672225,0.600866,1.123492,1.17893,11.46872,14.269573,252.567306,0
4,93.570312,46.698114,0.531905,0.416721,1.636288,14.545074,10.621748,131.394004,0


In [92]:
df.columns = [['Mean of integrated profile', 'Standard deviation of integrated profile', 
               'Excess kurtosis of integrated profile', 'Skewness of integrated profile',
               'Mean of DM-SNR curve', 'Standard deviation of DM-SNR curve',
               'Excess kurtosis of DM-SNR curve', 'Skewness of DM-SNR curve', 'Class' ]]

In [93]:
df.head()

Unnamed: 0,Mean of integrated profile,Standard deviation of integrated profile,Excess kurtosis of integrated profile,Skewness of integrated profile,Mean of DM-SNR curve,Standard deviation of DM-SNR curve,Excess kurtosis of DM-SNR curve,Skewness of DM-SNR curve,Class
0,102.507812,58.88243,0.465318,-0.515088,1.677258,14.860146,10.576487,127.39358,0
1,103.015625,39.341649,0.323328,1.051164,3.121237,21.744669,7.735822,63.171909,0
2,136.75,57.178449,-0.068415,-0.636238,3.642977,20.95928,6.896499,53.593661,0
3,88.726562,40.672225,0.600866,1.123492,1.17893,11.46872,14.269573,252.567306,0
4,93.570312,46.698114,0.531905,0.416721,1.636288,14.545074,10.621748,131.394004,0


In [94]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17897 entries, 0 to 17896
Data columns (total 9 columns):
(Mean of integrated profile,)                  17897 non-null float64
(Standard deviation of integrated profile,)    17897 non-null float64
(Excess kurtosis of integrated profile,)       17897 non-null float64
(Skewness of integrated profile,)              17897 non-null float64
(Mean of DM-SNR curve,)                        17897 non-null float64
(Standard deviation of DM-SNR curve,)          17897 non-null float64
(Excess kurtosis of DM-SNR curve,)             17897 non-null float64
(Skewness of DM-SNR curve,)                    17897 non-null float64
(Class,)                                       17897 non-null int64
dtypes: float64(8), int64(1)
memory usage: 1.2 MB


In [95]:
len(df)

17897

In [96]:
y = df.iloc[:, -1]
X = df.iloc[:,:-1]

In [97]:
from sklearn.linear_model import LogisticRegression

In [98]:
def clf_model(model, cv=3):

    # Create the classifier clf
    clf = model

    scores = cross_val_score(clf, X, y, cv=cv)

    print('Scores:', scores)
    print('Mean score:', scores.mean())

In [99]:
clf_model(LogisticRegression())

Scores: [0.9740238  0.98222967 0.9766974 ]
Mean score: 0.9776502907183512


In [100]:
from sklearn.neighbors import KNeighborsClassifier
clf_model(KNeighborsClassifier())

Scores: [0.96899615 0.97200335 0.97082984]
Mean score: 0.9706097796987464


In [101]:
from sklearn.naive_bayes import GaussianNB
clf_model(GaussianNB())

Scores: [0.95692978 0.92472758 0.94836547]
Mean score: 0.9433409410695212


In [102]:
from sklearn.tree import DecisionTreeClassifier
clf_model(DecisionTreeClassifier(random_state=22))

Scores: [0.96732026 0.96194468 0.96697402]
Mean score: 0.9654129846033599


In [103]:
from sklearn.ensemble import RandomForestClassifier
clf_model(RandomForestClassifier(random_state=22))

Scores: [0.97620245 0.98005029 0.97636211]
Mean score: 0.9775382841635327


In [104]:
df.Class.count()

Class    17897
dtype: int64

In [105]:
df[df.Class == 1].Class.count()

Class    1639
dtype: int64

In [106]:
df[df.Class == 1].Class.count()/df.Class.count()

Class    0.09158
dtype: float64

In [107]:
from sklearn.metrics import classification_report
from sklearn.metrics import confusion_matrix

# Create training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state=22)

def confusion(model):

    # Create a multinomial classifier
    clf = model

    # Fit the classifier to the data
    clf.fit(X_train, y_train)

    # Predict the labels of the test set: y_pred
    y_pred = clf.predict(X_test)

    # Compute and print the confusion matrix and classification report
    print('Confusion Matrix:', confusion_matrix(y_test, y_pred))
    print('Classification Report:', classification_report(y_test, y_pred))
    
    return clf

In [108]:
confusion(LogisticRegression())

Confusion Matrix: [[4005   23]
 [  78  369]]
Classification Report:               precision    recall  f1-score   support

           0       0.98      0.99      0.99      4028
           1       0.94      0.83      0.88       447

    accuracy                           0.98      4475
   macro avg       0.96      0.91      0.93      4475
weighted avg       0.98      0.98      0.98      4475



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=None, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [109]:
confusion(KNeighborsClassifier())

Confusion Matrix: [[3994   34]
 [  95  352]]
Classification Report:               precision    recall  f1-score   support

           0       0.98      0.99      0.98      4028
           1       0.91      0.79      0.85       447

    accuracy                           0.97      4475
   macro avg       0.94      0.89      0.91      4475
weighted avg       0.97      0.97      0.97      4475



KNeighborsClassifier(algorithm='auto', leaf_size=30, metric='minkowski',
                     metric_params=None, n_jobs=None, n_neighbors=5, p=2,
                     weights='uniform')

In [110]:
confusion(GaussianNB())

Confusion Matrix: [[3848  180]
 [  68  379]]
Classification Report:               precision    recall  f1-score   support

           0       0.98      0.96      0.97      4028
           1       0.68      0.85      0.75       447

    accuracy                           0.94      4475
   macro avg       0.83      0.90      0.86      4475
weighted avg       0.95      0.94      0.95      4475



GaussianNB(priors=None, var_smoothing=1e-09)

In [111]:
confusion(DecisionTreeClassifier(random_state=22))

Confusion Matrix: [[3949   79]
 [  69  378]]
Classification Report:               precision    recall  f1-score   support

           0       0.98      0.98      0.98      4028
           1       0.83      0.85      0.84       447

    accuracy                           0.97      4475
   macro avg       0.90      0.91      0.91      4475
weighted avg       0.97      0.97      0.97      4475



DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
                       max_features=None, max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, presort=False,
                       random_state=22, splitter='best')

In [112]:
confusion(RandomForestClassifier(random_state=22))

Confusion Matrix: [[4001   27]
 [  73  374]]
Classification Report:               precision    recall  f1-score   support

           0       0.98      0.99      0.99      4028
           1       0.93      0.84      0.88       447

    accuracy                           0.98      4475
   macro avg       0.96      0.91      0.93      4475
weighted avg       0.98      0.98      0.98      4475



RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=None, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=1, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=10,
                       n_jobs=None, oob_score=False, random_state=22, verbose=0,
                       warm_start=False)

In [113]:
from sklearn.ensemble import AdaBoostClassifier
clf_model(AdaBoostClassifier())

Scores: [0.97519692 0.98122381 0.97652976]
Mean score: 0.9776501596069993


In [114]:
confusion(AdaBoostClassifier())

Confusion Matrix: [[4002   26]
 [  72  375]]
Classification Report:               precision    recall  f1-score   support

           0       0.98      0.99      0.99      4028
           1       0.94      0.84      0.88       447

    accuracy                           0.98      4475
   macro avg       0.96      0.92      0.94      4475
weighted avg       0.98      0.98      0.98      4475



AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None, learning_rate=1.0,
                   n_estimators=50, random_state=None)

In [115]:
X = housing_df.iloc[:,:-1]
y = housing_df.iloc[:, -1]

In [116]:
from sklearn.ensemble import AdaBoostRegressor
regression_model_cv(AdaBoostRegressor(random_state=22))

Reg rmse: [3.43931042 3.33956016 5.5065491  6.37311137 4.47250745]
Reg mean: 4.626207700032104
