In [30]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.model_selection import GridSearchCV

In [31]:
df = pd.read_excel("teacher_utterance_labels.xlsx")
df.sample()

Unnamed: 0,transcript_id,utterance_id,teacher_utterance_number,text,gold_standard
10547,32,3929,48,All right. I want you to go to the next page. ...,math instruction


In [32]:
df["math_instruction"] = np.where(df.gold_standard == "math instruction", 1, 0)
df.math_instruction.value_counts()

math_instruction
1    8564
0    2238
Name: count, dtype: int64

In [33]:
# Split into training, dev, and testing at the transcript level

TRAIN_RATIO = 0.6
DEV_RATIO = 0.2
TEST_RATIO = 0.2
np.random.seed(5643)

temp_df = df.copy()
temp_df = temp_df[["transcript_id", "utterance_id"]].groupby(["transcript_id"]).nunique()
temp_df['random_number'] = np.random.randint(1, 10001, size=len(temp_df))
temp_df = temp_df.sort_values(by = ["random_number"])


size = len(temp_df)
train_size = int(TRAIN_RATIO * size)
dev_size = int(DEV_RATIO * size)
test_size = size - train_size - dev_size

temp_df["training_split"] = ["train"]*train_size + ["dev"]*dev_size + ["test"]*test_size

df = df.merge(temp_df[["training_split"]], left_on = "transcript_id", right_index = True)
df = df.set_index("utterance_id")
df.sample(5)

Unnamed: 0_level_0,transcript_id,teacher_utterance_number,text,gold_standard,math_instruction,training_split
utterance_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
4505,37,75,We already know we have time we can say 10 and...,math instruction,1,test
3252,26,197,was thata good time to find thed rods and use ...,math instruction,1,train
9149,81,10,"It's not the 14th That's the 17th Okay, who ca...",math instruction,1,dev
3089,26,32,"Pencils behind you. Next to you, but not in yo...",classroom management,0,train
6804,58,117,But that wouldn't be a good strategy to write ...,math instruction,1,train


In [34]:
df.training_split.value_counts()

training_split
train    6544
dev      2273
test     1985
Name: count, dtype: int64

In [35]:
print(df[df.training_split == "train"].transcript_id.nunique())
print(df[df.training_split == "dev"].transcript_id.nunique())
print(df[df.training_split == "test"].transcript_id.nunique())

66
22
23


In [36]:
train_indices = list(df[df.training_split == "train"].index)
dev_indices = list(df[df.training_split == "dev"].index)
test_indices = list(df[df.training_split == "test"].index)

In [37]:
df

Unnamed: 0_level_0,transcript_id,teacher_utterance_number,text,gold_standard,math_instruction,training_split
utterance_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
9291,81,152,We're gonna put these aside. How many of you h...,math instruction,1,dev
9168,81,29,"Let's count with him,Ready? So 1 2 3 4 5 6, oh...",math instruction,1,dev
9186,81,47,1 2 3 4 It does. And we knew that because they...,classroom management,0,dev
9324,81,185,and I took it apart. I did. I need you to plea...,classroom management,0,dev
9156,81,17,Do we say 63 day or what? 63rd day. remember w...,math instruction,1,dev
...,...,...,...,...,...,...
10906,100,25,Cross your arms again. Put your hands together...,classroom management,0,dev
10923,100,47,how me people check out a (inaudible) yeah go ...,classroom management,0,dev
10888,100,3,Alex could you help him two keep that in mind...,math instruction,1,dev
10895,100,13,"Hayden. We think it's minus three, three plus ...",math instruction,1,dev


# Pre-Processing

#### Lematization and Removal of Domain Specific Stop Words

In [38]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
import nltk

# Download necessary NLTK data (if not already downloaded)
nltk.download('punkt') # For word_tokenize
nltk.download('wordnet') # For WordNetLemmatizer

# Custom stop words list
stop_words_classroom = [
    "a", "an", "the", # Articles
    "he", "she", "it", "they", # Pronouns
    "in", "on", "at", "from", # Prepositions
    "and", "but", "or", # Conjunctions
    "be", "have", "do", "is", "am", "are", "was", "were", # Auxiliary Verbs
    "say", "go", "get", "see", "know", "think", # Other Common Verbs
    "some", "any", "much", "many", # Quantifiers
    "always", "often", "sometimes", "never", # Adverbs of Frequency
    "can", "could", "may", "might", "will", "would", "should", # Modal Verbs
    "well", "so", "um", "uh", "like" # Filler Words
]

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Tokenization and lemmatization
    tokens = nltk.word_tokenize(text)
    lemmatized_tokens = [lemmatizer.lemmatize(token.lower()) for token in tokens]
    
    # Stop words removal
    filtered_tokens = [token for token in lemmatized_tokens if token not in stop_words_classroom]
    
    return " ".join(filtered_tokens)

# Apply preprocessing to the text column
df['lematized_no_stop_text'] = df['text'].apply(preprocess_text)

[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:997)>
[nltk_data] Error loading wordnet: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:997)>


In [39]:
df.head()

Unnamed: 0_level_0,transcript_id,teacher_utterance_number,text,gold_standard,math_instruction,training_split,lematized_no_stop_text
utterance_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
9291,81,152,We're gonna put these aside. How many of you h...,math instruction,1,dev,we 're gon na put these aside . how of you all...
9168,81,29,"Let's count with him,Ready? So 1 2 3 4 5 6, oh...",math instruction,1,dev,"let 's count with him , ready ? 1 2 3 4 5 6 , ..."
9186,81,47,1 2 3 4 It does. And we knew that because they...,classroom management,0,dev,1 2 3 4 doe . we knew that because 're related...
9324,81,185,and I took it apart. I did. I need you to plea...,classroom management,0,dev,i took apart . i did . i need you to please to...
9156,81,17,Do we say 63 day or what? 63rd day. remember w...,math instruction,1,dev,we 63 day what ? 63rd day . remember we 're ta...


### TF-IDF after Lemmatization & Stop Word Removal

In [40]:
# Define TF-IDF Vectorizer
tfidf_vectorizer = TfidfVectorizer(min_df = 0.01)

# Fit and transform the processed text
tfidf_features = tfidf_vectorizer.fit_transform(df['lematized_no_stop_text'])
tfidf_matrix = pd.DataFrame(tfidf_features.toarray(), columns=tfidf_vectorizer.get_feature_names_out(), index=df.index)

In [41]:
tfidf_matrix

Unnamed: 0_level_0,10,100,10s,11,12,13,14,15,16,17,...,wrote,yeah,yep,yes,yesterday,yet,you,your,yours,zero
utterance_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9291,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.351078,0.000000,0.0,0.0
9168,0.121956,0.0,0.0,0.228257,0.211335,0.241161,0.230335,0.235945,0.256515,0.267562,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0
9186,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.165237,0.141072,0.0,0.0
9324,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.101656,0.173579,0.0,0.0
9156,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10906,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.047629,0.406637,0.0,0.0
10923,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.106725,0.0,0.0,0.0,0.0,0.129759,0.332347,0.0,0.0
10888,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.191142,0.000000,0.0,0.0
10895,0.000000,0.0,0.0,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,0.000000,...,0.0,0.000000,0.0,0.0,0.0,0.0,0.000000,0.000000,0.0,0.0


### Count Vectorizer after Lemmatization & Stop Word Removal

In [42]:
vec = CountVectorizer(min_df = 0.01)
X = vec.fit_transform(df['lematized_no_stop_text'])
cv_matrix = pd.DataFrame(X.toarray(), columns=vec.get_feature_names_out(), index=df.index)

In [43]:
cv_matrix

Unnamed: 0_level_0,10,100,10s,11,12,13,14,15,16,17,...,wrote,yeah,yep,yes,yesterday,yet,you,your,yours,zero
utterance_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9291,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,5,0,0,0
9168,1,0,0,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
9186,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,4,2,0,0
9324,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
9156,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10906,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,5,0,0
10923,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,4,6,0,0
10888,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,7,0,0,0
10895,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## **Because we are going to perform 5-Fold Cross Validation, we do not need a seperate holdout dev set. This dev set will be created from the train set during the cross validation process. Therefore, we will merge the train_idicies and dev_indices into a single set of indices for the cross validation process instead of just throwing it away.**

### X & y Training Split

In [44]:
tf_train_X = cv_matrix[cv_matrix.index.isin(train_indices)]
tf_train_y = df[df.index.isin(train_indices)].math_instruction

cv_train_X = cv_matrix[cv_matrix.index.isin(train_indices)]
cv_train_y = df[df.index.isin(train_indices)].math_instruction

### X & y Dev Split

In [45]:
tf_dev_X = cv_matrix[cv_matrix.index.isin(dev_indices)]
tf_dev_y = df[df.index.isin(dev_indices)].math_instruction

cv_dev_X = cv_matrix[cv_matrix.index.isin(dev_indices)]
cv_dev_y = df[df.index.isin(dev_indices)].math_instruction

### Combining Train & Dev Indices for Cross Validation

In [46]:
tf_train_dev_X = cv_matrix[cv_matrix.index.isin(train_indices + dev_indices)]
tf_train_dev_y = df[df.index.isin(train_indices + dev_indices)].math_instruction

cv_train_dev_X = cv_matrix[cv_matrix.index.isin(train_indices + dev_indices)]
cv_train_dev_y = df[df.index.isin(train_indices + dev_indices)].math_instruction

### X & y Testing Split

In [47]:
tf_test_X = cv_matrix[cv_matrix.index.isin(test_indices)]
tf_test_y = df[df.index.isin(test_indices)].math_instruction

cv_test_X = cv_matrix[cv_matrix.index.isin(test_indices)]
cv_test_y = df[df.index.isin(test_indices)].math_instruction

In [48]:
df_token_tf_train_dev = df.merge(tf_train_X, left_index = True, right_index=True)

df_token_cv_train_dev = df.merge(cv_train_X, left_index = True, right_index=True)

## Random Forest Classifier

In [49]:
# from sklearn.ensemble import RandomForestClassifier
# from sklearn.model_selection import cross_val_score

# # Define the Random Forest model
# random_forest_model = RandomForestClassifier(random_state=5643)

# # Define your features (X) and target variable (y)
# X = cv_matrix
# y = df['math_instruction']

# # Apply 5-fold cross-validation
# # Here 'cv=5' specifies the number of folds, and 'n_jobs=-1' uses all available CPUs for parallel computation
# cross_val_scores = cross_val_score(random_forest_model, X, y, cv=5, scoring='accuracy', n_jobs=-1)

# print("Cross-validation scores: ", cross_val_scores)
# print("Average cross-validation score: ", cross_val_scores.mean())

In [50]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression

### Logistic Regression with LASSO penalty

First we will run a logistic regression with a penalty on the L1 norm on the size of $\beta$. We implement 5-fold cross validation to pick the optimal $\lambda$ value.

In [51]:
logistic_model = LogisticRegression(penalty='l1', random_state = 5643, solver = 'saga')
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}
grid_search = GridSearchCV(logistic_model, param_grid, cv=StratifiedKFold(n_splits=5), scoring = "accuracy")
grid_search.fit(df_token_lemma_train, train_y)

best_params = grid_search.best_params_
best_score = grid_search.best_score_
best_params
#logistic_model.fit(train_X, train_y)
#test_predictions = logistic_model.predict(test_X)

NameError: name 'df_token_lemma_train' is not defined