In [18]:
import pandas as pd
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, recall_score, precision_score
from sklearn.model_selection import GridSearchCV

In [19]:
df = pd.read_excel("teacher_utterance_labels.xlsx")
df.sample()

Unnamed: 0,transcript_id,utterance_id,teacher_utterance_number,text,gold_standard
10547,32,3929,48,All right. I want you to go to the next page. ...,math instruction


In [20]:
df["math_instruction"] = np.where(df.gold_standard == "math instruction", 1, 0)
df.math_instruction.value_counts()

1    8564
0    2238
Name: math_instruction, dtype: int64

In [21]:
# Split into training, dev, and testing at the transcript level

TRAIN_RATIO = 0.6
DEV_RATIO = 0.2
TEST_RATIO = 0.2
np.random.seed(5643)

temp_df = df.copy()
temp_df = temp_df[["transcript_id", "utterance_id"]].groupby(["transcript_id"]).nunique()
temp_df['random_number'] = np.random.randint(1, 10001, size=len(temp_df))
temp_df = temp_df.sort_values(by = ["random_number"])


size = len(temp_df)
train_size = int(TRAIN_RATIO * size)
dev_size = int(DEV_RATIO * size)
test_size = size - train_size - dev_size

temp_df["training_split"] = ["train"]*train_size + ["dev"]*dev_size + ["test"]*test_size

df = df.merge(temp_df[["training_split"]], left_on = "transcript_id", right_index = True)
df = df.set_index("utterance_id")
df.sample(5)

Unnamed: 0_level_0,transcript_id,teacher_utterance_number,text,gold_standard,math_instruction,training_split
utterance_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
4505,37,75,We already know we have time we can say 10 and...,math instruction,1,test
3252,26,197,was thata good time to find thed rods and use ...,math instruction,1,train
9149,81,10,"It's not the 14th That's the 17th Okay, who ca...",math instruction,1,dev
3089,26,32,"Pencils behind you. Next to you, but not in yo...",classroom management,0,train
6804,58,117,But that wouldn't be a good strategy to write ...,math instruction,1,train


In [22]:
df.training_split.value_counts()

train    6544
dev      2273
test     1985
Name: training_split, dtype: int64

In [23]:
print(df[df.training_split == "train"].transcript_id.nunique())
print(df[df.training_split == "dev"].transcript_id.nunique())
print(df[df.training_split == "test"].transcript_id.nunique())

66
22
23


In [24]:
train_indices = list(df[df.training_split == "train"].index)
dev_indices = list(df[df.training_split == "dev"].index)
test_indices = list(df[df.training_split == "test"].index)

In [25]:
df

Unnamed: 0_level_0,transcript_id,teacher_utterance_number,text,gold_standard,math_instruction,training_split
utterance_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
9291,81,152,We're gonna put these aside. How many of you h...,math instruction,1,dev
9168,81,29,"Let's count with him,Ready? So 1 2 3 4 5 6, oh...",math instruction,1,dev
9186,81,47,1 2 3 4 It does. And we knew that because they...,classroom management,0,dev
9324,81,185,and I took it apart. I did. I need you to plea...,classroom management,0,dev
9156,81,17,Do we say 63 day or what? 63rd day. remember w...,math instruction,1,dev
...,...,...,...,...,...,...
10906,100,25,Cross your arms again. Put your hands together...,classroom management,0,dev
10923,100,47,how me people check out a (inaudible) yeah go ...,classroom management,0,dev
10888,100,3,Alex could you help him two keep that in mind...,math instruction,1,dev
10895,100,13,"Hayden. We think it's minus three, three plus ...",math instruction,1,dev


# Pre-Processing

#### Lematization and stop words

In [30]:
from sklearn.feature_extraction.text import TfidfVectorizer
from nltk.stem import WordNetLemmatizer
import nltk

# Download necessary NLTK data (if not already downloaded)
nltk.download('punkt') # For word_tokenize
nltk.download('wordnet') # For WordNetLemmatizer

# Custom stop words list
stop_words_classroom = [
    "a", "an", "the", # Articles
    "he", "she", "it", "they", # Pronouns
    "in", "on", "at", "from", # Prepositions
    "and", "but", "or", # Conjunctions
    "be", "have", "do", "is", "am", "are", "was", "were", # Auxiliary Verbs
    "say", "go", "get", "see", "know", "think", # Other Common Verbs
    "some", "any", "much", "many", # Quantifiers
    "always", "often", "sometimes", "never", # Adverbs of Frequency
    "can", "could", "may", "might", "will", "would", "should", # Modal Verbs
    "well", "so", "um", "uh", "like" # Filler Words
]

# Initialize lemmatizer
lemmatizer = WordNetLemmatizer()

def preprocess_text(text):
    # Tokenization and lemmatization
    tokens = nltk.word_tokenize(text)
    lemmatized_tokens = [lemmatizer.lemmatize(token.lower()) for token in tokens]
    
    # Stop words removal
    filtered_tokens = [token for token in lemmatized_tokens if token not in stop_words_classroom]
    
    return " ".join(filtered_tokens)

# Apply preprocessing to the text column
df['lematized_no_stop_text'] = df['text'].apply(preprocess_text)

# Define TF-IDF Vectorizer with N-Grams
#tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 2), stop_words=stop_words_classroom)  # Optionally include custom stop words here

# Fit and transform the processed text
#tfidf_features = tfidf_vectorizer.fit_transform(df['processed_text'])
#tfidf_matrix = pd.DataFrame(tfidf_features.toarray(), columns=vec.get_feature_names_out(), index=df.index)

# tfidf_features is now the matrix containing TF-IDF scores with n-grams considered

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\ofior\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\ofior\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [31]:
df

Unnamed: 0_level_0,transcript_id,teacher_utterance_number,text,gold_standard,math_instruction,training_split,lematized_text,lematized_no_stop_text
utterance_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
9291,81,152,We're gonna put these aside. How many of you h...,math instruction,1,dev,we 're gon na put these aside . how of you all...,we 're gon na put these aside . how of you all...
9168,81,29,"Let's count with him,Ready? So 1 2 3 4 5 6, oh...",math instruction,1,dev,"let 's count with him , ready ? 1 2 3 4 5 6 , ...","let 's count with him , ready ? 1 2 3 4 5 6 , ..."
9186,81,47,1 2 3 4 It does. And we knew that because they...,classroom management,0,dev,1 2 3 4 doe . we knew that because 're related...,1 2 3 4 doe . we knew that because 're related...
9324,81,185,and I took it apart. I did. I need you to plea...,classroom management,0,dev,i took apart . i did . i need you to please to...,i took apart . i did . i need you to please to...
9156,81,17,Do we say 63 day or what? 63rd day. remember w...,math instruction,1,dev,we 63 day what ? 63rd day . remember we 're ta...,we 63 day what ? 63rd day . remember we 're ta...
...,...,...,...,...,...,...,...,...
10906,100,25,Cross your arms again. Put your hands together...,classroom management,0,dev,cross your arm again . put your hand together ...,cross your arm again . put your hand together ...
10923,100,47,how me people check out a (inaudible) yeah go ...,classroom management,0,dev,how me people check out ( inaudible ) yeah ahe...,how me people check out ( inaudible ) yeah ahe...
10888,100,3,Alex could you help him two keep that in mind...,math instruction,1,dev,alex you help him two keep that mind i 'm gon ...,alex you help him two keep that mind i 'm gon ...
10895,100,13,"Hayden. We think it's minus three, three plus ...",math instruction,1,dev,"hayden . we 's minus three , three plus three ...","hayden . we 's minus three , three plus three ..."


### Tfidf based on Lemmetization and stop words

In [36]:
# Define TF-IDF Vectorizer with N-Grams
tfidf_vectorizer = TfidfVectorizer(min_df = 0.01)  # Optionally include custom stop words here

# Fit and transform the processed text
tfidf_features = tfidf_vectorizer.fit_transform(df['lematized_no_stop_text'])
tfidf_matrix = pd.DataFrame(tfidf_features.toarray(), columns=tfidf_vectorizer.get_feature_names_out(), index=df.index)
tfidf_matrix
# tfidf_features is now the matrix containing TF-IDF scores with n-grams considered

### Count Vectorizer on Lemmetization and stop words

In [38]:
vec = CountVectorizer(min_df = 0.01)
X = vec.fit_transform(df['lematized_no_stop_text'])
cv_matrix = pd.DataFrame(X.toarray(), columns=vec.get_feature_names_out(), index=df.index)
cv_matrix

Unnamed: 0_level_0,10,100,10s,11,12,13,14,15,16,17,...,wrote,yeah,yep,yes,yesterday,yet,you,your,yours,zero
utterance_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
9291,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,5,0,0,0
9168,1,0,0,1,1,1,1,1,1,1,...,0,0,0,0,0,0,0,0,0,0
9186,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,4,2,0,0
9324,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,1,0,0
9156,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
10906,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,1,5,0,0
10923,0,0,0,0,0,0,0,0,0,0,...,0,1,0,0,0,0,4,6,0,0
10888,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,7,0,0,0
10895,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [None]:
vec = CountVectorizer()
X = vec.fit_transform(df.preprocesstext)
matrix = pd.DataFrame(X.toarray(), columns=vec.get_feature_names_out(), index=df.index)

train_X = matrix[matrix.index.isin(train_indices)]
train_y = df[df.index.isin(train_indices)].math_instruction

In [10]:
df_token_lemma_train = df.merge(train_X, left_index = True, right_index= True)

In [15]:
print(len(train_y))
print(df_token_lemma_train.shape)

6544
(6544, 6210)


In [11]:
from sklearn.model_selection import GridSearchCV, StratifiedKFold
from sklearn.linear_model import LogisticRegression

### Logistic Regression with LASSO penalty

First we will run a logistic regression with a penalty on the L1 norm on the size of $\beta$. We implement 5-fold cross validation to pick the optimal $\lambda$ value.

In [16]:
logistic_model = LogisticRegression(penalty='l1', random_state = 5643, solver = 'saga')
param_grid = {'C': [0.001, 0.01, 0.1, 1, 10, 100]}
grid_search = GridSearchCV(logistic_model, param_grid, cv=StratifiedKFold(n_splits=5), scoring = "accuracy")
grid_search.fit(df_token_lemma_train, train_y)

best_params = grid_search.best_params_
best_score = grid_search.best_score_
best_params
#logistic_model.fit(train_X, train_y)
#test_predictions = logistic_model.predict(test_X)

ValueError: 
All the 30 fits failed.
It is very likely that your model is misconfigured.
You can try to debug the error by setting error_score='raise'.

Below are more details about the failures:
--------------------------------------------------------------------------------
6 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\ofior\Documents\Conda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\ofior\Documents\Conda3\Lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\ofior\Documents\Conda3\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1207, in fit
    X, y = self._validate_data(
           ^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\ofior\Documents\Conda3\Lib\site-packages\sklearn\base.py", line 621, in _validate_data
    X, y = check_X_y(X, y, **check_params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\ofior\Documents\Conda3\Lib\site-packages\sklearn\utils\validation.py", line 1147, in check_X_y
    X = check_array(
        ^^^^^^^^^^^^
  File "c:\Users\ofior\Documents\Conda3\Lib\site-packages\sklearn\utils\validation.py", line 917, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\ofior\Documents\Conda3\Lib\site-packages\sklearn\utils\_array_api.py", line 380, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\ofior\Documents\Conda3\Lib\site-packages\pandas\core\generic.py", line 2070, in __array__
    return np.asarray(self._values, dtype=dtype)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: could not convert string to float: "Mirium, what happened? What did you do explain to everybody what you did? boys. So she came up with this number sentence. 16 minus seven equals nine and I see that she was she colored im those seven undernearth because that's what Thomas had. give a thumbs up or thumbs down on whether or not you had something that was very similar to Miriam's because I know I want you out."

--------------------------------------------------------------------------------
24 fits failed with the following error:
Traceback (most recent call last):
  File "c:\Users\ofior\Documents\Conda3\Lib\site-packages\sklearn\model_selection\_validation.py", line 732, in _fit_and_score
    estimator.fit(X_train, y_train, **fit_params)
  File "c:\Users\ofior\Documents\Conda3\Lib\site-packages\sklearn\base.py", line 1151, in wrapper
    return fit_method(estimator, *args, **kwargs)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\ofior\Documents\Conda3\Lib\site-packages\sklearn\linear_model\_logistic.py", line 1207, in fit
    X, y = self._validate_data(
           ^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\ofior\Documents\Conda3\Lib\site-packages\sklearn\base.py", line 621, in _validate_data
    X, y = check_X_y(X, y, **check_params)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\ofior\Documents\Conda3\Lib\site-packages\sklearn\utils\validation.py", line 1147, in check_X_y
    X = check_array(
        ^^^^^^^^^^^^
  File "c:\Users\ofior\Documents\Conda3\Lib\site-packages\sklearn\utils\validation.py", line 917, in check_array
    array = _asarray_with_order(array, order=order, dtype=dtype, xp=xp)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\ofior\Documents\Conda3\Lib\site-packages\sklearn\utils\_array_api.py", line 380, in _asarray_with_order
    array = numpy.asarray(array, order=order, dtype=dtype)
            ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
  File "c:\Users\ofior\Documents\Conda3\Lib\site-packages\pandas\core\generic.py", line 2070, in __array__
    return np.asarray(self._values, dtype=dtype)
           ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
ValueError: could not convert string to float: "How many fewer children saw the movie in the afternoon than in the morning? Okay. All right. There's something special about this one. Mario, what is it? Is it a two part question."
