In [74]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.model_selection import GridSearchCV

In [75]:
df = pd.read_excel("teacher_utterance_labels.xlsx")
df.sample()

Unnamed: 0,transcript_id,utterance_id,teacher_utterance_number,text,gold_standard
10547,32,3929,48,All right. I want you to go to the next page. ...,math instruction


In [76]:
df["math_instruction"] = np.where(df.gold_standard == "math instruction", 1, 0)
df.math_instruction.value_counts()

math_instruction
1    8564
0    2238
Name: count, dtype: int64

In [77]:
# Split into training, dev, and testing at the transcript level

TRAIN_RATIO = 0.6
DEV_RATIO = 0.2
TEST_RATIO = 0.2
np.random.seed(5643)

temp_df = df.copy()
temp_df = temp_df[["transcript_id", "utterance_id"]].groupby(["transcript_id"]).nunique()
temp_df['random_number'] = np.random.randint(1, 10001, size=len(temp_df))
temp_df = temp_df.sort_values(by = ["random_number"])


size = len(temp_df)
train_size = int(TRAIN_RATIO * size)
dev_size = int(DEV_RATIO * size)
test_size = size - train_size - dev_size

temp_df["training_split"] = ["train"]*train_size + ["dev"]*dev_size + ["test"]*test_size

df = df.merge(temp_df[["training_split"]], left_on = "transcript_id", right_index = True)
df = df.set_index("utterance_id")
df.sample(5)

Unnamed: 0_level_0,transcript_id,teacher_utterance_number,text,gold_standard,math_instruction,training_split
utterance_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
4505,37,75,We already know we have time we can say 10 and...,math instruction,1,test
3252,26,197,was thata good time to find thed rods and use ...,math instruction,1,train
9149,81,10,"It's not the 14th That's the 17th Okay, who ca...",math instruction,1,dev
3089,26,32,"Pencils behind you. Next to you, but not in yo...",classroom management,0,train
6804,58,117,But that wouldn't be a good strategy to write ...,math instruction,1,train


In [78]:
df.training_split.value_counts()

training_split
train    6544
dev      2273
test     1985
Name: count, dtype: int64

In [79]:
print(df[df.training_split == "train"].transcript_id.nunique())
print(df[df.training_split == "dev"].transcript_id.nunique())
print(df[df.training_split == "test"].transcript_id.nunique())

66
22
23


In [80]:
train_indices = list(df[df.training_split == "train"].index)
dev_indices = list(df[df.training_split == "dev"].index)
test_indices = list(df[df.training_split == "test"].index)

In [81]:
df

Unnamed: 0_level_0,transcript_id,teacher_utterance_number,text,gold_standard,math_instruction,training_split
utterance_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
9291,81,152,We're gonna put these aside. How many of you h...,math instruction,1,dev
9168,81,29,"Let's count with him,Ready? So 1 2 3 4 5 6, oh...",math instruction,1,dev
9186,81,47,1 2 3 4 It does. And we knew that because they...,classroom management,0,dev
9324,81,185,and I took it apart. I did. I need you to plea...,classroom management,0,dev
9156,81,17,Do we say 63 day or what? 63rd day. remember w...,math instruction,1,dev
...,...,...,...,...,...,...
10906,100,25,Cross your arms again. Put your hands together...,classroom management,0,dev
10923,100,47,how me people check out a (inaudible) yeah go ...,classroom management,0,dev
10888,100,3,Alex could you help him two keep that in mind...,math instruction,1,dev
10895,100,13,"Hayden. We think it's minus three, three plus ...",math instruction,1,dev


# Pre-Processing

#### Lematization and Removal of Domain Specific Stop Words

In [82]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
import nltk

# Download necessary NLTK data (if not already downloaded)
nltk.download('punkt') # For word_tokenize
nltk.download('wordnet') # For WordNetLemmatizer

# Custom stop words list
stop_words_classroom = [
    "a", "an", "the", # Articles
    "he", "she", "it", "they", # Pronouns
    "in", "on", "at", "from", # Prepositions
    "and", "but", "or", # Conjunctions
    "be", "have", "do", "is", "am", "are", "was", "were", # Auxiliary Verbs
    "say", "go", "get", "see", "know", "think", # Other Common Verbs
    "some", "any", "much", "many", # Quantifiers
    "always", "often", "sometimes", "never", # Adverbs of Frequency
    "can", "could", "may", "might", "will", "would", "should", # Modal Verbs
    "well", "so", "um", "uh", "like" # Filler Words
]

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Tokenization and lemmatization
    tokens = nltk.word_tokenize(text)
    lemmatized_tokens = [lemmatizer.lemmatize(token.lower()) for token in tokens]
    
    # Stop words removal
    filtered_tokens = [token for token in lemmatized_tokens if token not in stop_words_classroom]
    
    return " ".join(filtered_tokens)

# Apply preprocessing to the text column
df['lematized_no_stop_text'] = df['text'].apply(preprocess_text)

[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:997)>
[nltk_data] Error loading wordnet: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:997)>


In [83]:
df.head()

Unnamed: 0_level_0,transcript_id,teacher_utterance_number,text,gold_standard,math_instruction,training_split,lematized_no_stop_text
utterance_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
9291,81,152,We're gonna put these aside. How many of you h...,math instruction,1,dev,we 're gon na put these aside . how of you all...
9168,81,29,"Let's count with him,Ready? So 1 2 3 4 5 6, oh...",math instruction,1,dev,"let 's count with him , ready ? 1 2 3 4 5 6 , ..."
9186,81,47,1 2 3 4 It does. And we knew that because they...,classroom management,0,dev,1 2 3 4 doe . we knew that because 're related...
9324,81,185,and I took it apart. I did. I need you to plea...,classroom management,0,dev,i took apart . i did . i need you to please to...
9156,81,17,Do we say 63 day or what? 63rd day. remember w...,math instruction,1,dev,we 63 day what ? 63rd day . remember we 're ta...


### TF-IDF after Lemmatization & Stop Word Removal

In [84]:
# Define TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(min_df = 0.01)

# Fit and transform the processed text
tfidf_features = tfidf_vectorizer.fit_transform(df['lematized_no_stop_text'])
tfidf_matrix = pd.DataFrame(tfidf_features.toarray(), columns=tfidf_vectorizer.get_feature_names_out(), index=df.index)

In [85]:
tfidf_matrix

Unnamed: 0_level_0,10,100,10s,11,12,13,14,15,16,17,...,wrote,yeah,yep,yes,yesterday,yet,you,your,yours,zero
utterance_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9291,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.351078,0.000000,0.0,0.0
9168,0.121956,0.0,0.0,0.228257,0.211335,0.241161,0.230335,0.235945,0.256515,0.267562,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0
9186,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.165237,0.141072,0.0,0.0
9324,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.101656,0.173579,0.0,0.0
9156,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10906,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.047629,0.406637,0.0,0.0
10923,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.106725,0.0,0.0,0.0,0.0,0.129759,0.332347,0.0,0.0
10888,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.191142,0.000000,0.0,0.0
10895,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0


### Count Vectorizer after Lemmatization & Stop Word Removal

In [86]:
vec = CountVectorizer(min_df = 0.01)
X = vec.fit_transform(df['lematized_no_stop_text'])
cv_matrix = pd.DataFrame(X.toarray(), columns=vec.get_feature_names_out(), index=df.index)

In [87]:
cv_matrix

Unnamed: 0_level_0,10,100,10s,11,12,13,14,15,16,17,...,wrote,yeah,yep,yes,yesterday,yet,you,your,yours,zero
utterance_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9291,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,5,0,0,0
9168,1,0,0,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
9186,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,4,2,0,0
9324,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
9156,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10906,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,5,0,0
10923,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,4,6,0,0
10888,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,7,0,0,0
10895,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## **Because we are going to perform 5-Fold Cross Validation, we do not need a seperate holdout dev set. This dev set will be created from the train set during the cross validation process. Therefore, we will merge the train_idicies and dev_indices into a single set of indices for the cross validation process instead of just throwing it away.**

### X & y Training Split

In [88]:
tf_train_X = tfidf_matrix[tfidf_matrix.index.isin(train_indices)]
tf_train_y = df[df.index.isin(train_indices)].math_instruction

cv_train_X = cv_matrix[cv_matrix.index.isin(train_indices)]
cv_train_y = df[df.index.isin(train_indices)].math_instruction

### X & y Dev Split

In [89]:
tf_dev_X = tfidf_matrix[tfidf_matrix.index.isin(dev_indices)]
tf_dev_y = df[df.index.isin(dev_indices)].math_instruction

cv_dev_X = cv_matrix[cv_matrix.index.isin(dev_indices)]
cv_dev_y = df[df.index.isin(dev_indices)].math_instruction

### Combining Train & Dev Indices for Cross Validation

In [90]:
tf_train_dev_X = tfidf_matrix[tfidf_matrix.index.isin(train_indices + dev_indices)]
tf_train_dev_y = df[df.index.isin(train_indices + dev_indices)].math_instruction

cv_train_dev_X = cv_matrix[cv_matrix.index.isin(train_indices + dev_indices)]
cv_train_dev_y = df[df.index.isin(train_indices + dev_indices)].math_instruction

### X & y Testing Split

In [91]:
tf_test_X = tfidf_matrix[tfidf_matrix.index.isin(test_indices)]
tf_test_y = df[df.index.isin(test_indices)].math_instruction

cv_test_X = cv_matrix[cv_matrix.index.isin(test_indices)]
cv_test_y = df[df.index.isin(test_indices)].math_instruction

In [92]:
df_token_tf_train_dev = df.merge(tf_train_dev_X, left_index = True, right_index=True)

df_token_cv_train_dev = df.merge(cv_train_dev_X, left_index = True, right_index=True)

## Random Forest Classifier for TF-IDF Model 1

In [107]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer

In [108]:
# Define the Random Forest model
tf_random_forest_model_1 = RandomForestClassifier(random_state=5643)

# Define the grid of parameters to search over
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
}

f1_scorer = make_scorer(f1_score)

# Setup GridSearchCV
grid_search_cv = GridSearchCV(estimator=tf_random_forest_model_1, param_grid=param_grid, cv=5, scoring=f1_scorer, n_jobs=-1)

# Fit GridSearchCV on the training data
grid_search_cv.fit(tf_train_X, tf_train_y)

print("Best parameters:", grid_search_cv.best_params_)

# Access the best model
best_model = grid_search_cv.best_estimator_

# Use the best model for predictions
predictions = best_model.predict(tf_dev_X)

# Evaluate the model's performance
accuracy = accuracy_score(y_true = tf_dev_y, y_pred = predictions, )
precision = precision_score(y_true = tf_dev_y, y_pred = predictions)
recall = recall_score(y_true = tf_dev_y, y_pred = predictions)
f1 = f1_score(y_true = tf_dev_y, y_pred = predictions)

print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1:', f1)

Best parameters: {'n_estimators': 400}
Accuracy: 0.8583369995600528
Precision: 0.8569284642321161
Recall: 0.9794168096054888
F1: 0.9140875133404482


## Random Forest Classifier for CV Model 1

In [109]:
# Define the Random Forest model
cv_random_forest_model_1 = RandomForestClassifier(random_state=5643)

# Define the grid of parameters to search over
param_grid = {
    'n_estimators': [100, 200, 300, 400, 500],
}

f1_scorer = make_scorer(f1_score)

# Setup GridSearchCV
grid_search_cv = GridSearchCV(estimator=cv_random_forest_model_1, param_grid=param_grid, cv=5, scoring=f1_scorer, n_jobs=-1)

# Fit GridSearchCV on the training data
grid_search_cv.fit(cv_train_X, cv_train_y)

print("Best parameters:", grid_search_cv.best_params_)

# Access the best model
best_model = grid_search_cv.best_estimator_

# Use the best model for predictions
predictions = best_model.predict(cv_dev_X)

# Evaluate the model's performance
accuracy = accuracy_score(y_true = cv_dev_y, y_pred = predictions, )
precision = precision_score(y_true = cv_dev_y, y_pred = predictions)
recall = recall_score(y_true = cv_dev_y, y_pred = predictions)
f1 = f1_score(y_true = cv_dev_y, y_pred = predictions)

print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1:', f1)

Best parameters: {'n_estimators': 100}
Accuracy: 0.8644962604487462
Precision: 0.861878453038674
Recall: 0.9811320754716981
F1: 0.9176470588235295


## Random Forest Classifier for TF-IDF Model 2

In [121]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer


# Define the Random Forest model
tf_random_forest_model_2 = RandomForestClassifier(random_state=5643)

# Define the grid of parameters to search over
param_grid = {
    'bootstrap': [True, False] 
}

f1_scorer = make_scorer(f1_score)

# Setup GridSearchCV
grid_search_cv = GridSearchCV(estimator=tf_random_forest_model_2, param_grid=param_grid, cv=5, scoring=f1_scorer, n_jobs=-1)

# Fit GridSearchCV on the training data
grid_search_cv.fit(tf_train_X, tf_train_y)

print("Best parameters:", grid_search_cv.best_params_)

# Access the best model
best_model = grid_search_cv.best_estimator_

# Use the best model for predictions
predictions = best_model.predict(tf_dev_X)

# Evaluate the model's performance
accuracy = accuracy_score(y_true = tf_dev_y, y_pred = predictions, )
precision = precision_score(y_true = tf_dev_y, y_pred = predictions)
recall = recall_score(y_true = tf_dev_y, y_pred = predictions)
f1 = f1_score(y_true = tf_dev_y, y_pred = predictions)

print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1:', f1)

Best parameters: {'bootstrap': False}
Accuracy: 0.8675758908930928
Precision: 0.8667679837892603
Recall: 0.978273299028016
F1: 0.9191512221326886


## Random Forest Classifier for TF-IDF Model 2

In [120]:
# Define the Random Forest model
cv_random_forest_model_2 = RandomForestClassifier(random_state=5643)

# Define the grid of parameters to search over
param_grid = {
    'bootstrap': [True, False]  # Method of selecting samples for training each tree
}
f1_scorer = make_scorer(f1_score)

# Setup GridSearchCV
grid_search_cv = GridSearchCV(estimator=cv_random_forest_model_2, param_grid=param_grid, cv=5, scoring=f1_scorer, n_jobs=-1)

# Fit GridSearchCV on the training data
grid_search_cv.fit(cv_train_X, cv_train_y)

print("Best parameters:", grid_search_cv.best_params_)

# Access the best model
best_model = grid_search_cv.best_estimator_

# Use the best model for predictions
predictions = best_model.predict(cv_dev_X)

# Evaluate the model's performance
accuracy = accuracy_score(y_true = cv_dev_y, y_pred = predictions, )
precision = precision_score(y_true = cv_dev_y, y_pred = predictions)
recall = recall_score(y_true = cv_dev_y, y_pred = predictions)
f1 = f1_score(y_true = cv_dev_y, y_pred = predictions)

print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1:', f1)

Best parameters: {'bootstrap': False}
Accuracy: 0.8702155741311043
Precision: 0.8709183673469387
Recall: 0.9759862778730704
F1: 0.9204637368562955


# Gradient Boosting

In [122]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.neural_network import MLPClassifier

from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, make_scorer

In [123]:
param_grid = {
    'max_depth': [3,4,5]  # Maximum depth of the individual trees
}
f1_scorer = make_scorer(f1_score)
boosted_forest = GradientBoostingClassifier()

grid_search = GridSearchCV(boosted_forest, param_grid, cv=StratifiedKFold(n_splits=5), scoring = f1_scorer)
grid_search.fit(tf_train_X, tf_train_y)

best_params = grid_search.best_params_
best_score = grid_search.best_score_
best_params

{'max_depth': 5}

In [124]:
tf_gb_depth_model = GradientBoostingClassifier(max_depth = best_params["max_depth"])
tf_gb_depth_model.fit(tf_train_X, tf_train_y)

In [125]:
param_grid = {
    'max_depth': [3,4,5]  # Maximum depth of the individual trees
}
f1_scorer = make_scorer(f1_score)
boosted_forest = GradientBoostingClassifier()

grid_search = GridSearchCV(boosted_forest, param_grid, cv=StratifiedKFold(n_splits=5), scoring = f1_scorer)
grid_search.fit(cv_train_X, cv_train_y)

best_params = grid_search.best_params_
best_score = grid_search.best_score_
best_params

{'max_depth': 5}

In [126]:
cv_gb_depth_model = GradientBoostingClassifier(max_depth = best_params["max_depth"])
cv_gb_depth_model.fit(cv_train_X, cv_train_y)

In [127]:
param_grid = {
    'n_estimators': [50, 150],  # Number of boosting stages
}
f1_scorer = make_scorer(f1_score)
boosted_forest = GradientBoostingClassifier()

grid_search = GridSearchCV(boosted_forest, param_grid, cv=StratifiedKFold(n_splits=5), scoring = f1_scorer)
grid_search.fit(tf_train_X, tf_train_y)

best_params = grid_search.best_params_
best_score = grid_search.best_score_
best_params

{'n_estimators': 150}

In [128]:
tf_gb_estimators_model = GradientBoostingClassifier(n_estimators = best_params["n_estimators"])
tf_gb_estimators_model.fit(tf_train_X, tf_train_y)

In [129]:
param_grid = {
    'n_estimators': [50, 150],  # Number of boosting stages
}
f1_scorer = make_scorer(f1_score)
boosted_forest = GradientBoostingClassifier()

grid_search = GridSearchCV(boosted_forest, param_grid, cv=StratifiedKFold(n_splits=5), scoring = f1_scorer)
grid_search.fit(cv_train_X, cv_train_y)

best_params = grid_search.best_params_
best_score = grid_search.best_score_
best_params

{'n_estimators': 150}

In [130]:
cv_gb_estimators_model = GradientBoostingClassifier(n_estimators = best_params["n_estimators"])
cv_gb_estimators_model.fit(cv_train_X, cv_train_y)

In [131]:
test_predictions = tf_gb_depth_model.predict(tf_dev_X)

accuracy = accuracy_score(y_true = tf_dev_y, y_pred = test_predictions)
precision = precision_score(y_true = tf_dev_y, y_pred = test_predictions)
recall = recall_score(y_true = tf_dev_y, y_pred = test_predictions)
f1 = f1_score(y_true = tf_dev_y, y_pred = test_predictions)

print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1:', f1)

Accuracy: 0.8609766827980643
Precision: 0.8687596500257334
Recall: 0.9651229273870783
F1: 0.9144095341278439


In [132]:
test_predictions = cv_gb_depth_model.predict(cv_dev_X)

accuracy = accuracy_score(y_true = cv_dev_y, y_pred = test_predictions)
precision = precision_score(y_true = cv_dev_y, y_pred = test_predictions)
recall = recall_score(y_true = cv_dev_y, y_pred = test_predictions)
f1 = f1_score(y_true = cv_dev_y, y_pred = test_predictions)

print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1:', f1)

Accuracy: 0.8622965244170699
Precision: 0.8751306165099269
Recall: 0.9576901086335049
F1: 0.9145509145509144


In [133]:
test_predictions = tf_gb_estimators_model.predict(tf_dev_X)

accuracy = accuracy_score(y_true = tf_dev_y, y_pred = test_predictions)
precision = precision_score(y_true = tf_dev_y, y_pred = test_predictions)
recall = recall_score(y_true = tf_dev_y, y_pred = test_predictions)
f1 = f1_score(y_true = tf_dev_y, y_pred = test_predictions)

print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1:', f1)

Accuracy: 0.857017157941047
Precision: 0.8643807574206755
Recall: 0.9656946826758147
F1: 0.912233324331623


In [134]:
test_predictions = cv_gb_estimators_model.predict(cv_dev_X)

accuracy = accuracy_score(y_true = cv_dev_y, y_pred = test_predictions)
precision = precision_score(y_true = cv_dev_y, y_pred = test_predictions)
recall = recall_score(y_true = cv_dev_y, y_pred = test_predictions)
f1 = f1_score(y_true = cv_dev_y, y_pred = test_predictions)

print('Accuracy:', accuracy)
print('Precision:', precision)
print('Recall:', recall)
print('F1:', f1)

Accuracy: 0.8561372635283766
Precision: 0.8646153846153846
Recall: 0.9639794168096055
F1: 0.9115977291159773


### Logistic Regression with LASSO penalty

First we will run a logistic regression with a penalty on the L1 norm on the size of $\beta$. We implement 5-fold cross validation to pick the optimal $\lambda$ value. We do this for both the tfidf and the count vectorizer data.

In [135]:
lasso_model = LogisticRegression(penalty='l1', random_state = 5643, solver = 'saga')
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}
tf_grid_search_lasso = GridSearchCV(lasso_model, param_grid, cv=StratifiedKFold(n_splits=5), scoring = "accuracy")
tf_grid_search_lasso.fit(tf_train_X, tf_train_y)

tf_best_params_lasso = tf_grid_search_lasso.best_params_
tf_best_score_lasso = tf_grid_search_lasso.best_score_
#print(tf_best_params_lasso)



In [136]:
from sklearn.metrics import accuracy_score, precision_score, recall_score, f1_score, confusion_matrix

In [137]:
tf_dev_preds_lasso = tf_grid_search_lasso.predict(tf_dev_X)
tf_lasso_acc = accuracy_score(y_true = tf_dev_y, y_pred = tf_dev_preds_lasso)
tf_lasso_pre = precision_score(y_true = tf_dev_y, y_pred = tf_dev_preds_lasso)
tf_lasso_rec = recall_score(y_true = tf_dev_y, y_pred = tf_dev_preds_lasso)
tf_lasso_f1 = f1_score(y_true = tf_dev_y, y_pred = tf_dev_preds_lasso)
print("Accuracy: ", tf_lasso_acc)
print("Precision: ", tf_lasso_pre)
print("Recall: ", tf_lasso_rec)
print("F1 Score: ", tf_lasso_f1)

Accuracy:  0.8891333040035195
Precision:  0.9043760129659644
Recall:  0.9571183533447685
F1 Score:  0.93


We repeat the methods above but with the Count Vectorizer method instead of tf-idf

In [138]:
logistic_model = LogisticRegression(penalty='l1', random_state = 5643, solver = 'saga')
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}
cv_grid_search_lasso = GridSearchCV(logistic_model, param_grid, cv=StratifiedKFold(n_splits=5), scoring = "accuracy")
cv_grid_search_lasso.fit(cv_train_X, cv_train_y)

cv_best_params_lasso = cv_grid_search_lasso.best_params_
cv_best_score_lasso = cv_grid_search_lasso.best_score_



In [139]:
cv_dev_preds_lasso = cv_grid_search_lasso.predict(cv_dev_X)
cv_lasso_acc = accuracy_score(y_true = cv_dev_y, y_pred = cv_dev_preds_lasso)
cv_lasso_pre = precision_score(y_true = cv_dev_y, y_pred = cv_dev_preds_lasso)
cv_lasso_rec = recall_score(y_true = cv_dev_y, y_pred = cv_dev_preds_lasso)
cv_lasso_f1 = f1_score(y_true = cv_dev_y, y_pred = cv_dev_preds_lasso)
print("Accuracy: ", cv_lasso_acc)
print("Precision: ", cv_lasso_pre)
print("Recall: ", cv_lasso_rec)
print("F1 Score: ", cv_lasso_f1)

Accuracy:  0.8891333040035195
Precision:  0.9128516271373415
Recall:  0.9462550028587764
F1 Score:  0.9292532285233015


Next we will run a logistic regression with a penalty on the L2 norm on the size of $\beta$. We implement 5-fold cross validation to pick the optimal $\lambda$ value. We do this for both the tfidf and the count vectorizer data.

In [140]:
ridge_model = LogisticRegression(penalty='l2', random_state = 5643, solver = 'saga')
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}
tf_grid_search_ridge = GridSearchCV(ridge_model, param_grid, cv=StratifiedKFold(n_splits=5), scoring = "accuracy")
tf_grid_search_ridge.fit(tf_train_X, tf_train_y)

tf_best_params_ridge = tf_grid_search_ridge.best_params_
tf_best_score_ridge = tf_grid_search_ridge.best_score_



In [141]:
tf_dev_preds_ridge = tf_grid_search_ridge.predict(tf_dev_X)
tf_ridge_acc = accuracy_score(y_true = tf_dev_y, y_pred = tf_dev_preds_ridge)
tf_ridge_pre = precision_score(y_true = tf_dev_y, y_pred = tf_dev_preds_ridge)
tf_ridge_rec = recall_score(y_true = tf_dev_y, y_pred = tf_dev_preds_ridge)
tf_ridge_f1 = f1_score(y_true = tf_dev_y, y_pred = tf_dev_preds_ridge)
print("Accuracy: ", tf_ridge_acc)
print("Precision: ", tf_ridge_pre)
print("Recall: ", tf_ridge_rec)
print("F1 Score: ", tf_ridge_f1)

Accuracy:  0.8926528816542015
Precision:  0.9155162893429045
Recall:  0.9479702687249857
F1 Score:  0.9314606741573035


We repeat the methods above but with the Count Vectorizer method instead of tf-idf

In [142]:
ridge_model = LogisticRegression(penalty='l2', random_state = 5643, solver = 'saga')
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}
cv_grid_search_ridge = GridSearchCV(ridge_model, param_grid, cv=StratifiedKFold(n_splits=5), scoring = "accuracy")
cv_grid_search_ridge.fit(cv_train_X, cv_train_y)

cv_best_params_ridge = cv_grid_search_ridge.best_params_
cv_best_score_ridge = cv_grid_search_ridge.best_score_



In [143]:
cv_dev_preds_ridge = cv_grid_search_ridge.predict(cv_dev_X)
cv_ridge_acc = accuracy_score(y_true = cv_dev_y, y_pred = cv_dev_preds_ridge)
cv_ridge_pre = precision_score(y_true = cv_dev_y, y_pred = cv_dev_preds_ridge)
cv_ridge_rec = recall_score(y_true = cv_dev_y, y_pred = cv_dev_preds_ridge)
cv_ridge_f1 = f1_score(y_true = cv_dev_y, y_pred = cv_dev_preds_ridge)
print("Accuracy: ", cv_ridge_acc)
print("Precision: ", cv_ridge_pre)
print("Recall: ", cv_ridge_rec)
print("F1 Score: ", cv_ridge_f1)

Accuracy:  0.8860536735591729
Precision:  0.9053318824809575
Recall:  0.9514008004574043
F1 Score:  0.9277948146083077


# Evaluate best model on the test set

In [144]:
tf_test_preds_ridge = tf_grid_search_ridge.predict(tf_test_X)
tf_ridge_acc = accuracy_score(y_true = tf_test_y, y_pred = tf_test_preds_ridge)
tf_ridge_pre = precision_score(y_true = tf_test_y, y_pred = tf_test_preds_ridge)
tf_ridge_rec = recall_score(y_true = tf_test_y, y_pred = tf_test_preds_ridge)
tf_ridge_f1 = f1_score(y_true = tf_test_y, y_pred = tf_test_preds_ridge)
print("Accuracy: ", tf_ridge_acc)
print("Precision: ", tf_ridge_pre)
print("Recall: ", tf_ridge_rec)
print("F1 Score: ", tf_ridge_f1)

Accuracy:  0.9037783375314862
Precision:  0.9214285714285714
Recall:  0.9632856253889235
F1 Score:  0.9418923030118649
