In [6]:
#5.1
from pydataset import data
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import RandomForestClassifier
from sklearn import cross_validation, svm
from sklearn.metrics import confusion_matrix

def convert_dummy(col,row):
    if row[col] > 0:
        return 1
    else:
        return 0

df = data('affairs')
df['naffairs'] = df.apply(lambda row: convert_dummy('naffairs',row),axis=1)
X = df[['kids', 'vryunhap','unhap','avgmarr','hapavg','vryhap','antirel','notrel','slghtrel','smerel',
          'vryrel','yrsmarr1','yrsmarr2','yrsmarr3','yrsmarr4','yrsmarr5','yrsmarr6']].values
y = df.naffairs.values

clf = RandomForestClassifier()
param_grid = {
    'n_estimators': [10, 100, 1000],
    'max_features': ['auto', 'sqrt', 'log2']
}

rs = GridSearchCV(clf, cv=5, param_grid=param_grid)
rs.fit(X,y)
print(rs.best_estimator_)

# 5.1 Best estimator is max_features=auto, n_estimator=1000
clf = RandomForestClassifier(n_estimators=1000, max_features='auto')
clf.fit(X,y)
y_pred = clf.predict(X)

naffair, nacorr, waffair, wacorr = 0, 0, 0, 0
for i in range(len(y)):
    if y[i] == 0:
        naffair = naffair + 1
        if y_pred[i] == 0:
            nacorr = nacorr + 1
    else:
        waffair = waffair + 1
        if y_pred[i] == 1:
            wacorr = wacorr + 1

naaccu = float(nacorr) / naffair
waaccu =  float(wacorr)/waffair
perclassaccu = (naaccu + waaccu) / 2
print("The confusion matrix is:\n", confusion_matrix(y,y_pred))
print("The accuracy for no affair class is:", naaccu)
print("The accuracy for affair class is:", waaccu)
print("The average per-class accuracy is:", perclassaccu)

importances = clf.feature_importances_
print(importances)
# 5.2
# N_estimators shows the number of trees in the forest.
# Max_features shows the number of features to consider when looking for the best split:
# 5.3
# From the weight table, we can see that features are ranked as follows in terms of weight:
# (from most to least important) kids, vryhap, antirel, unhap,  ... yrsmarr2, vryrel, yrsmarr1

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features='log2', max_leaf_nodes=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=False, random_state=None, verbose=0,
            warm_start=False)
The confusion matrix is:
 [[437  14]
 [103  47]]
The accuracy for no affair class is: 0.9689578713968958
The accuracy for affair class is: 0.31333333333333335
The average per-class accuracy is: 0.6411456023651145
[ 0.10455787  0.03374118  0.08802575  0.05311752  0.05932662  0.07760034
  0.06403705  0.05787703  0.06596279  0.05867132  0.03929155  0.03441239
  0.05024605  0.05357223  0.05463316  0.04880724  0.0561199 ]


In [8]:
#5.4
from pydataset import data
from sklearn.grid_search import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier
from sklearn import cross_validation, svm
from sklearn.metrics import confusion_matrix

def convert_dummy(col,row):
    if row[col] > 0:
        return 1
    else:
        return 0

df = data('affairs')
df['naffairs'] = df.apply(lambda row: convert_dummy('naffairs',row),axis=1)
X = df[['kids', 'vryunhap','unhap','avgmarr','hapavg','vryhap','antirel','notrel','slghtrel','smerel',
          'vryrel','yrsmarr1','yrsmarr2','yrsmarr3','yrsmarr4','yrsmarr5','yrsmarr6']].values
y = df.naffairs.values

clf = AdaBoostClassifier()
param_grid = {
    'n_estimators': [50, 500, 5000],
    'learning_rate': [0.001, 0.01, 0.1]
}

rs = GridSearchCV(clf, cv=5, param_grid=param_grid)
rs.fit(X,y)
print(rs.best_estimator_)
# 5.1 Best estimator is learning_rate = 0.001, n_estimator=5000

clf = AdaBoostClassifier(n_estimators=5000, learning_rate=0.001)
clf.fit(X, y)
y_pred = clf.predict(X)

naffair, nacorr, waffair, wacorr = 0, 0, 0, 0
for i in range(len(y)):
    if y[i] == 0:
        naffair = naffair + 1
        if y_pred[i] == 0:
            nacorr = nacorr + 1
    else:
        waffair = waffair + 1
        if y_pred[i] == 1:
            wacorr = wacorr + 1

naaccu = float(nacorr) / naffair
waaccu =  float(wacorr)/waffair
perclassaccu = (naaccu + waaccu) / 2
print("The confusion matrix is:\n", confusion_matrix(y,y_pred))
print("The accuracy for no affair class is:", naaccu)
print("The accuracy for affair class is:", waaccu)
print("The average per-class accuracy is:", perclassaccu)

importances = clf.feature_importances_
print(importances)
# 5.5
# Learning rate shrinks the contribution of each classifier by learning_rate.
# It determines how much we learn from each new training data.
# 5.6
# From the weight table, we can see that features are ranked as follows in terms of weight:
# (from most to least important) unhap, vryhap, antirel, yrsmarr2, smerel, yrsmarr1, 
# slghtrel, kids, vryunhap, and the rest has no impact on the final result.

AdaBoostClassifier(algorithm='SAMME.R', base_estimator=None,
          learning_rate=0.001, n_estimators=5000, random_state=None)
The confusion matrix is:
 [[451   0]
 [148   2]]
The accuracy for no affair class is: 1.0
The accuracy for affair class is: 0.013333333333333334
The average per-class accuracy is: 0.5066666666666667
[ 0.0576  0.051   0.1836  0.      0.      0.1522  0.14    0.      0.066
  0.1184  0.      0.1074  0.1238  0.      0.      0.      0.    ]


In [None]:
# 5.7 The Random Forest has on average better performance average AdaBoost.
# Compare with all four methods, Random Forest > SVC > AdaBoost (I don't see how q3 is related to this question...)