# Example: Bag of Words

In [1]:
import numpy as np 
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, BaggingClassifier, AdaBoostClassifier, VotingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC

In [2]:
df = pd.read_csv("../data/clean_compiled_transcripts.csv")

CV = CountVectorizer(min_df=3) # ignore terms that appear in less than 3 documents

X = CV.fit_transform(df.Transcript).toarray()
y = df.PHQ_Binary.to_numpy()

RANDOM_STATE = 42 # lucky number

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=RANDOM_STATE)
X_train.shape, X_test.shape

((144, 2902), (37, 2902))

In [3]:
# Decision Tree

dtc = DecisionTreeClassifier()

dtc.fit(X_train, y_train)
print(dtc.score(X_test, y_test))
print(dtc.score(X_train, y_train)) # clearly overfitting

0.5945945945945946
1.0


In [4]:
# Random Forest - Ensemble of Decision Trees

rf = RandomForestClassifier(n_estimators=20)

rf.fit(X_train, y_train)
print(rf.score(X_test, y_test)) # improvement in score
print(rf.score(X_train, y_train))

0.7297297297297297
0.9861111111111112


In [5]:
# Bagging 

bg = BaggingClassifier(DecisionTreeClassifier(), max_samples=0.5, max_features=1.0, n_estimators=20)
bg.fit(X_train, y_train)

print(bg.score(X_test, y_test))
print(bg.score(X_train, y_train)) # less overfitting

0.6756756756756757
0.9305555555555556


In [6]:
# Boosting - Ada Boost

adb = AdaBoostClassifier(DecisionTreeClassifier(), n_estimators=5, learning_rate=1)
adb.fit(X_train, y_train)

print(adb.score(X_test, y_test))
print(adb.score(X_train, y_train))

0.6756756756756757
1.0


In [7]:
# Voting Classifier - Multiple Model Ensemble

lr = LogisticRegression()
dtc = DecisionTreeClassifier()
svm = SVC(kernel='poly', degree=2, probability=True)

# Hard voting: Suppose three classifiers predicted the output class(A, A, B), follow the majority which predicts A as output
# Soft voting: Suppose the prediction probability for class A = (0.30, 0.47, 0.53) and B = (0.20, 0.32, 0.40), so the
# average for class A is 0.4333 and B is 0.3067, so A is predicted as the output

evc = VotingClassifier(estimators= [('lr', lr), ('dt', dtc), ('svm', svm)], voting='hard')
evc.fit(X_train, y_train)

print(evc.score(X_test, y_test))
print(evc.score(X_train, y_train))

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression


0.7027027027027027
1.0
