In [10]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.model_selection import GridSearchCV

In [11]:
df = pd.read_excel("teacher_utterance_labels.xlsx")
df.sample()

Unnamed: 0,transcript_id,utterance_id,teacher_utterance_number,text,gold_standard
10547,32,3929,48,All right. I want you to go to the next page. ...,math instruction


In [12]:
df["math_instruction"] = np.where(df.gold_standard == "math instruction", 1, 0)
df.math_instruction.value_counts()

math_instruction
1    8564
0    2238
Name: count, dtype: int64

In [13]:
# Split into training, dev, and testing at the transcript level

TRAIN_RATIO = 0.6
DEV_RATIO = 0.2
TEST_RATIO = 0.2
np.random.seed(5643)

temp_df = df.copy()
temp_df = temp_df[["transcript_id", "utterance_id"]].groupby(["transcript_id"]).nunique()
temp_df['random_number'] = np.random.randint(1, 10001, size=len(temp_df))
temp_df = temp_df.sort_values(by = ["random_number"])


size = len(temp_df)
train_size = int(TRAIN_RATIO * size)
dev_size = int(DEV_RATIO * size)
test_size = size - train_size - dev_size

temp_df["training_split"] = ["train"]*train_size + ["dev"]*dev_size + ["test"]*test_size

df = df.merge(temp_df[["training_split"]], left_on = "transcript_id", right_index = True)
df = df.set_index("utterance_id")
df.sample(5)

Unnamed: 0_level_0,transcript_id,teacher_utterance_number,text,gold_standard,math_instruction,training_split
utterance_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
4505,37,75,We already know we have time we can say 10 and...,math instruction,1,test
3252,26,197,was thata good time to find thed rods and use ...,math instruction,1,train
9149,81,10,"It's not the 14th That's the 17th Okay, who ca...",math instruction,1,dev
3089,26,32,"Pencils behind you. Next to you, but not in yo...",classroom management,0,train
6804,58,117,But that wouldn't be a good strategy to write ...,math instruction,1,train


In [14]:
df.training_split.value_counts()

training_split
train    6544
dev      2273
test     1985
Name: count, dtype: int64

In [15]:
print(df[df.training_split == "train"].transcript_id.nunique())
print(df[df.training_split == "dev"].transcript_id.nunique())
print(df[df.training_split == "test"].transcript_id.nunique())

66
22
23


In [16]:
train_indices = list(df[df.training_split == "train"].index)
dev_indices = list(df[df.training_split == "dev"].index)
test_indices = list(df[df.training_split == "test"].index)

In [17]:
# Note: this is just to illustrate how you can subset a feature matrix to only include rows in training. 
# You can do something similar for dev and testing

vec = CountVectorizer()
X = vec.fit_transform(df.text)
matrix = pd.DataFrame(X.toarray(), columns=vec.get_feature_names_out(), index=df.index)

train_X = matrix[matrix.index.isin(train_indices)]
train_y = df[df.index.isin(train_indices)].math_instruction

In [19]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
import nltk

# Download necessary NLTK data (if not already downloaded)
nltk.download('punkt') # For word_tokenize
nltk.download('wordnet') # For WordNetLemmatizer

# Custom stop words list
stop_words_classroom = [
    "a", "an", "the", # Articles
    "he", "she", "it", "they", # Pronouns
    "in", "on", "at", "from", # Prepositions
    "and", "but", "or", # Conjunctions
    "be", "have", "do", "is", "am", "are", "was", "were", # Auxiliary Verbs
    "say", "go", "get", "see", "know", "think", # Other Common Verbs
    "some", "any", "much", "many", # Quantifiers
    "always", "often", "sometimes", "never", # Adverbs of Frequency
    "can", "could", "may", "might", "will", "would", "should", # Modal Verbs
    "well", "so", "um", "uh", "like" # Filler Words
]

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Tokenization and lemmatization
    tokens = nltk.word_tokenize(text)
    lemmatized_tokens = [lemmatizer.lemmatize(token.lower()) for token in tokens]
    
    # Stop words removal
    filtered_tokens = [token for token in lemmatized_tokens if token not in stop_words_classroom and token.isalpha()]
    
    return " ".join(filtered_tokens)

# Apply preprocessing to the text column
df['processed_text'] = df['text'].apply(preprocess_text)

# Define TF-IDF Vectorizer with N-Grams
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words=stop_words_classroom)  # Optionally include custom stop words here

# Fit and transform the processed text
tfidf_features = tfidf_vectorizer.fit_transform(df['processed_text'])

# tfidf_features is now the matrix containing TF-IDF scores with n-grams considered

[nltk_data] Error loading punkt: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:997)>
[nltk_data] Error loading wordnet: <urlopen error [SSL:
[nltk_data]     CERTIFICATE_VERIFY_FAILED] certificate verify failed:
[nltk_data]     unable to get local issuer certificate (_ssl.c:997)>


In [21]:
df.head()

Unnamed: 0_level_0,transcript_id,teacher_utterance_number,text,gold_standard,math_instruction,training_split,processed_text
utterance_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1
9291,81,152,We're gonna put these aside. How many of you h...,math instruction,1,dev,we gon na put these aside how of you all toget...
9168,81,29,"Let's count with him,Ready? So 1 2 3 4 5 6, oh...",math instruction,1,dev,let count with him ready oh i ran out of penny...
9186,81,47,1 2 3 4 It does. And we knew that because they...,classroom management,0,dev,doe we knew that because related we knew that ...
9324,81,185,and I took it apart. I did. I need you to plea...,classroom management,0,dev,i took apart i did i need you to please to you...
9156,81,17,Do we say 63 day or what? 63rd day. remember w...,math instruction,1,dev,we day what day remember we talking about orde...
