In [9]:
import re
import os
import tqdm
from glob import glob

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from pprint import pprint
import graphviz

from collections import defaultdict
from bs4 import BeautifulSoup

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn import tree
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split 
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import normalize
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

from collections import Counter
import gensim
import nltk
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from gensim.scripts.glove2word2vec import glove2word2vec

from joblib import dump

In [3]:
data_path = "data/html/*.html"

code2convos = dict()

pbar = tqdm.tqdm(sorted(list(glob(data_path))))
for path in pbar:
    # print(Path.cwd() / path)
    file_code = os.path.basename(path).split(".")[0]
    with open(path, "r", encoding="latin1") as fh:
            
        # get the file id to use it as key later on
        fid = os.path.basename(path).split(".")[0]

        # read the html file
        html_page = fh.read()

        # parse the html file with bs4 so we can extract needed stuff
        soup = BeautifulSoup(html_page, "html.parser")

        # grab the conversations with the data-testid pattern
        data_test_id_pattern = re.compile(r"conversation-turn-[0-9]+")
        conversations = soup.find_all("div", attrs={"data-testid": data_test_id_pattern})

        convo_texts = []

        for i, convo in enumerate(conversations):
            convo = convo.find_all("div", attrs={"data-message-author-role":re.compile( r"[user|assistant]") })
            if len(convo) > 0:
                role = convo[0].get("data-message-author-role")
                convo_texts.append({
                        "role" : role,
                        "text" : convo[0].text
                    }
                )
                
        code2convos[file_code] = convo_texts

100%|██████████| 127/127 [00:24<00:00,  5.28it/s]


In [4]:
# let's see one of the conversations
pprint(code2convos["0031c86e-81f4-4eef-9e0e-28037abf9883"][0])

{'role': 'user',
 'text': 'Load a CSV file into a Pandas in Python. The file is named '
         "'cs412_hw1_dataset.csv' and contains columns like 'Species', "
         "'Island', 'Sex', 'Diet', 'Year', 'Life Stage', 'Body Mass (g)', "
         "'Bill Length (mm)', 'Bill Depth (mm)', 'Flipper Length (mm)', and "
         "'Health Metrics'. \n"}


#### Things to do:
- Prompt matching with questions
- Feature Engineering
- Question Grades preparation
- Train/Test split
- Fitting a model for predicting the scores

#### Prompt Matching
> We want to match the prompts with the questions in the Homework Let's
> do it with a simple term frequency vectorizing method. For each prompt,
> we will come with a vector that represents it. We will do the same
> thing with each of the homework questions. Then, we will calculate the
> vectors distanance to do the matching

In [5]:
prompts = []
code2prompts = defaultdict(list)
for code , convos in code2convos.items():
    user_prompts = []
    for conv in convos:
        if conv["role"] == "user":
            prompts.append(conv["text"])
            user_prompts.append(conv["text"])
    code2prompts[code] = user_prompts    

In [6]:
prompts[0]

"Load a CSV file into a Pandas in Python. The file is named 'cs412_hw1_dataset.csv' and contains columns like 'Species', 'Island', 'Sex', 'Diet', 'Year', 'Life Stage', 'Body Mass (g)', 'Bill Length (mm)', 'Bill Depth (mm)', 'Flipper Length (mm)', and 'Health Metrics'. \n"

In [7]:
questions = [
    """Initialize
*   First make a copy of the notebook given to you as a starter.
*   Make sure you choose Connect form upper right.
*   You may upload the data to the section on your left on Colab, than right click on the .csv file and get the path of the file by clicking on "Copy Path". You will be using it when loading the data.

""",
#####################
    """Load training dataset (5 pts)
    *  Read the .csv file with the pandas library
""",
#####################
"""Understanding the dataset & Preprocessing (15 pts)
Understanding the Dataset: (5 pts)
> - Find the shape of the dataset (number of samples & number of attributes). (Hint: You can use the **shape** function)
> - Display variable names (both dependent and independent).
> - Display the summary of the dataset. (Hint: You can use the **info** function)
> - Display the first 5 rows from training dataset. (Hint: You can use the **head** function)
Preprocessing: (10 pts)

> - Check if there are any missing values in the dataset. If there are, you can either drop these values or fill it with most common values in corresponding rows. **Be careful that you have enough data for training the  model.**

> - Encode categorical labels with the mappings given in the cell below. (Hint: You can use **map** function)
""",
#####################
"""Set X & y, split data (5 pts)

*   Shuffle the dataset.
*   Seperate your dependent variable X, and your independent variable y. The column health_metrics is y, the rest is X.
*   Split training and test sets as 80% and 20%, respectively.
""",
#####################
"""Features and Correlations (10 pts)

* Correlations of features with health (4 points)
Calculate the correlations for all features in dataset. Highlight any strong correlations with the target variable. Plot your results in a heatmap.

* Feature Selection (3 points)
Select a subset of features that are likely strong predictors, justifying your choices based on the computed correlations.

* Hypothetical Driver Features (3 points)
Propose two hypothetical features that could enhance the model's predictive accuracy for Y, explaining how they might be derived and their expected impact. Show the resulting correlations with target variable.

* __Note:__ You get can get help from GPT.
""",
#####################
"""Tune Hyperparameters (20 pts)
* Choose 2 hyperparameters to tune. You can use the Scikit learn decision tree documentation for the available hyperparameters *(Hyperparameters are listed under "Parameters" in the documentation)*. Use GridSearchCV for hyperparameter tuning, with a cross-validation value of 5. Use validation accuracy to pick the best hyper-parameter values. (15 pts)
-Explain the hyperparameters you chose to tune. *(What are the hyperparameters you chose? Why did you choose them?)* (5 pts)
""",
#####################
"""Re-train and plot the decision tree with the hyperparameters you have chosen (15 pts)
- Re-train model with the hyperparameters you have chosen in part 5). (10 pts)
- Plot the tree you have trained. (5 pts)
Hint: You can import the **plot_tree** function from the sklearn library.
""",
#####################
"""Test your classifier on the test set (20 pts)
- Predict the labels of testing data using the tree you have trained in step 6. (10 pts)
- Report the classification accuracy. (2 pts)
- Plot & investigate the confusion matrix. Fill the following blanks. (8 pts)
> The model most frequently mistakes class(es) _________ for class(es) _________.
Hint: You can use the confusion_matrix function from sklearn.metrics
""",
#####################
"""Find the information gain on the first split (10 pts)""",
#####################
]

In [8]:
"""
glove_input_file = 'glove/glove.6B.100d.txt'
word2vec_output_file = 'glove/glove.6B.100d.word2vec.txt'
glove2word2vec(glove_input_file, word2vec_output_file)

glove_input_file = 'glove/glove.6B.50d.txt'
word2vec_output_file = 'glove/glove.6B.50d.word2vec.txt'
glove2word2vec(glove_input_file, word2vec_output_file)

glove_input_file = 'glove/glove.6B.200d.txt'
word2vec_output_file = 'glove/glove.6B.200d.word2vec.txt'
glove2word2vec(glove_input_file, word2vec_output_file)
"""

"\nglove_input_file = 'glove/glove.6B.100d.txt'\nword2vec_output_file = 'glove/glove.6B.100d.word2vec.txt'\nglove2word2vec(glove_input_file, word2vec_output_file)\n\nglove_input_file = 'glove/glove.6B.50d.txt'\nword2vec_output_file = 'glove/glove.6B.50d.word2vec.txt'\nglove2word2vec(glove_input_file, word2vec_output_file)\n\nglove_input_file = 'glove/glove.6B.200d.txt'\nword2vec_output_file = 'glove/glove.6B.200d.word2vec.txt'\nglove2word2vec(glove_input_file, word2vec_output_file)\n"

In [11]:
glove_model = KeyedVectors.load_word2vec_format('glove/glove.6B.50d.word2vec.txt', binary=False)
dump(glove_model, 'joblib_models/glove_model_50d.joblib')
def preprocess_and_tokenize(text):
    words = word_tokenize(text.lower())
    return [word for word in words if word.isalpha()]


In [None]:
def vectorize_prompts(prompts, model):
    vectorized = []
    for prompt in prompts:
        words = preprocess_and_tokenize(prompt)
        word_vectors = [model[word] for word in words if word in model.key_to_index]

        if len(word_vectors) == 0:
            vectorized.append(np.zeros(model.vector_size))  
        else:
            vectorized.append(np.mean(word_vectors, axis=0)) 

    return pd.DataFrame(vectorized)

code2prompts_glove = dict()

for code, user_prompts in code2prompts.items():
    if len(user_prompts) > 0:
        vectorized_df = vectorize_prompts(user_prompts, glove_model)
        code2prompts_glove[code] = vectorized_df
    else:
        print(f"{code}.html has no prompts")
        
print(code2prompts_glove["089eb66d-4c3a-4f58-b98f-a3774a2efb34"].shape)

In [None]:
code2prompts_glove["089eb66d-4c3a-4f58-b98f-a3774a2efb34"].head()

In [None]:
vectorizer = TfidfVectorizer()
vectorizer = vectorizer.fit(prompts + questions)
questions_TF_IDF = pd.DataFrame(vectorizer.transform(questions).toarray(), columns=vectorizer.get_feature_names_out())
questions_TF_IDF.head()

In [None]:
code2prompts_tf_idf = dict()
for code, user_prompts in code2prompts.items():
    if len(user_prompts) == 0:
        print(code+".html")
        continue
    prompts_TF_IDF = pd.DataFrame(vectorizer.transform(user_prompts).toarray(), columns=vectorizer.get_feature_names_out())
    code2prompts_tf_idf[code] = prompts_TF_IDF

In [None]:
code2prompts_tf_idf["089eb66d-4c3a-4f58-b98f-a3774a2efb34"].head()

In [None]:
code2cosine = dict()
for code, user_prompts_tf_idf in code2prompts_tf_idf.items():
    code2cosine[code] = pd.DataFrame(cosine_similarity(questions_TF_IDF,user_prompts_tf_idf))


In [None]:
code2cosine["089eb66d-4c3a-4f58-b98f-a3774a2efb34"]

In [None]:
question_of_prompts = dict()

for code, df in code2cosine.items():
    max_indices = []

    for col in df.columns:
        max_index = df[col].idxmax()

        max_indices.append(max_index)

    question_of_prompts[code] = max_indices
    
question_of_prompts["089eb66d-4c3a-4f58-b98f-a3774a2efb34"]

In [None]:
code2prompts_glove["089eb66d-4c3a-4f58-b98f-a3774a2efb34"].shape

In [None]:
num_questions = len(questions)

new_code_counts = dict()

for code, indices in question_of_prompts.items():
    counts = Counter(indices)

    count_vector = [counts.get(i, 0) for i in range(num_questions)]

    new_code_counts[code] = count_vector

new_code_counts["04f91058-d0f8-4324-83b2-19c671f433dc"]

In [None]:
vector_size = 50   

final_df = pd.DataFrame(index=code2prompts_glove.keys(), columns=[f'q{i}_feature_{j}' for i in range(num_questions) for j in range(vector_size)])

for code in code2prompts_glove.keys():
    glove_vectors = code2prompts_glove[code]
    question_indices = question_of_prompts[code]

    summed_vectors = np.zeros((num_questions, vector_size))
    prompt_counts = Counter(question_indices)

    for vector, question_idx in zip(glove_vectors, question_indices):
        scaled_vector = vector / prompt_counts[question_idx]
        summed_vectors[question_idx] += scaled_vector

    for i in range(num_questions):
        column_labels = [f'q{i}_feature_{j}' for j in range(vector_size)]
        final_df.loc[code, column_labels] = summed_vectors[i]

final_df.rename_axis('code', inplace=True)
final_df.head()

In [None]:
scaler = StandardScaler()

normalized_data = scaler.fit_transform(final_df)

normalized_df = pd.DataFrame(normalized_data, index=final_df.index, columns=final_df.columns)

normalized_df.rename_axis('code', inplace=True)

# Feature Engineering
- Number of prompts that a user asked
- Number of complaints that a user makes e.g "the code gives this error!"
- User prompts average number of characters

In [None]:
coefficients = [0.1,0.05,0.15,0.05,0.1,0.2,0.15,0.1,0.1]

modified_code_counts = {}

for code, count_vector in new_code_counts.items():
    modified_vector = [count * coeff for count, coeff in zip(count_vector, coefficients)]
    modified_code_counts[code] = modified_vector

modified_counts_df = pd.DataFrame.from_dict(modified_code_counts, orient='index')

modified_counts_df.index.name = 'code'

extended_df = normalized_df.merge(modified_counts_df, left_on='code', right_index=True)

modified_count_col_names = [f'modified_count_{i}' for i in range(num_questions)]
extended_df.rename(columns=dict(zip(range(num_questions), modified_count_col_names)), inplace=True)

extended_df.head()

In [None]:
code2questionmapping = dict()
for code, cosine_scores in code2cosine.items():
    code2questionmapping[code] = code2cosine[code].max(axis=1).tolist()

question_mapping_scores = pd.DataFrame(code2questionmapping).T
question_mapping_scores.reset_index(inplace=True)
question_mapping_scores.rename(columns={i: f"Q_{i}" for i in range(len(questions))}, inplace=True)
question_mapping_scores.rename(columns={"index" : "code"}, inplace=True)

question_mapping_scores.head()

In [None]:
merged_df = question_mapping_scores.merge(extended_df, on='code', how='left')
# merged_df = question_mapping_scores.merge(modified_counts_df, on='code', how='left')
merged_df.shape

In [None]:
# reading the scores
scores = pd.read_csv("data/scores.csv", sep=",")
scores["code"] = scores["code"].apply(lambda x: x.strip())

# selecting the columns we need and we care
scores = scores[["code", "grade"]]

# show some examples
scores.head()

In [None]:
# Let's check grades distribution

plt.title('Histogram Grades')
plt.hist(scores["grade"], rwidth=.8, bins=np.arange(min(scores["grade"]), max(scores["grade"])+2) - 0.5)
plt.ylabel('Count')
plt.show()

#### Merging scores with features

In [438]:
from joblib import dump

temp_df = pd.merge(merged_df, scores, on='code', how="left")
temp_df.dropna(inplace=True)
temp_df.drop_duplicates("code",inplace=True, keep="first")

temp_df = temp_df[temp_df['grade'] >= 70]

temp_df['grade'] = temp_df['grade'] - 70

scaler = StandardScaler()

temp_df['grade'] = scaler.fit_transform(temp_df[['grade']])
dump(scaler, 'scaler.joblib')
temp_df.head()


Unnamed: 0,code,Q_0,Q_1,Q_2,Q_3,Q_4,Q_5,Q_6,Q_7,Q_8,...,modified_count_0,modified_count_1,modified_count_2,modified_count_3,modified_count_4,modified_count_5,modified_count_6,modified_count_7,modified_count_8,grade
1,0225686d-b825-4cac-8691-3a3a5343df2b,0.192013,0.795607,0.772184,0.882656,0.607114,0.987511,0.892586,0.570741,0.543866,...,0.0,0.05,0.75,0.05,0.4,0.4,0.15,0.3,0.1,0.816992
2,041f950b-c013-409a-a642-cffff60b9d4b,0.258306,0.295923,0.624824,0.351872,0.643038,0.454314,0.540269,0.546506,0.325793,...,0.0,0.0,0.3,0.15,0.1,0.0,0.3,0.1,0.0,-0.553696
3,04f91058-d0f8-4324-83b2-19c671f433dc,0.145965,0.117841,0.267346,0.316809,0.333889,0.309084,0.192434,0.261892,0.407106,...,0.0,0.0,0.9,0.15,0.6,0.2,0.15,0.2,0.1,0.512395
4,089eb66d-4c3a-4f58-b98f-a3774a2efb34,0.344182,0.575528,0.782883,0.624833,0.724872,0.872171,0.684797,0.945305,0.511769,...,0.5,0.05,2.85,0.75,1.7,1.8,1.05,0.4,0.9,0.969291
5,090d6217-5d69-4929-a342-19abab78324f,0.181981,0.716248,0.654161,0.710065,0.678797,0.872171,0.794699,0.652518,0.810325,...,0.0,0.25,2.25,0.65,1.0,1.4,0.9,0.5,1.2,-1.467488


In [None]:
X = temp_df[temp_df.columns[1:-1]].to_numpy()
y = temp_df["grade"].to_numpy()
print(X.shape, y.shape)

#### Train/Test split

In [435]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
print("Train set size:", len(X_train))
print("Test set size:", len(X_test))

Train set size: 94
Test set size: 24


(94, 468)

#### Fitting a model

#### Predicting and Analyzing 

In [425]:
from joblib import dump

regressor = DecisionTreeRegressor(random_state=0,criterion='squared_error', max_depth=5)
regressor.fit(X_train, y_train)

dump(regressor, '412_model.joblib')

['412_model.joblib']

In [426]:
extracted_MSEs = regressor.tree_.impurity   
for idx, MSE in enumerate(regressor.tree_.impurity):
    print("Node {} has MSE {}".format(idx,MSE))

Node 0 has MSE 0.9058181813259566
Node 1 has MSE 1.2586574827217656
Node 2 has MSE 1.3932309919328423
Node 3 has MSE 0.5860256410474337
Node 4 has MSE 0.005798718984516116
Node 5 has MSE 0.0
Node 6 has MSE -1.1102230246251565e-16
Node 7 has MSE 0.11504658465280149
Node 8 has MSE 0.0
Node 9 has MSE 0.015946477207419762
Node 10 has MSE 0.939155793084093
Node 11 has MSE 0.2996004808666708
Node 12 has MSE 0.08553110502161232
Node 13 has MSE 0.023194875938067128
Node 14 has MSE 1.984089975648473e-15
Node 15 has MSE 0.6452159118283847
Node 16 has MSE 0.44323958237895744
Node 17 has MSE 0.10824275437763592
Node 18 has MSE 0.0057987189845162546
Node 19 has MSE -2.220446049250313e-16
Node 20 has MSE 0.3121014667642568
Node 21 has MSE 0.07731625312688267
Node 22 has MSE 0.32472826313290726
Node 23 has MSE 0.036080918125879435
Node 24 has MSE 0.005798718984516116
Node 25 has MSE 0.0
Node 26 has MSE -4.440892098500626e-16
Node 27 has MSE -4.440892098500626e-16
Node 28 has MSE 0.43532312711924115
N

In [428]:
# Prediction
y_train_pred = regressor.predict(X_train)
y_test_pred = regressor.predict(X_test)

# Calculation of Mean Squared Error (MSE)
print("MSE Train:", mean_squared_error(y_train,y_train_pred))
print("MSE TEST:", mean_squared_error(y_test,y_test_pred))

print("R2 Train:", r2_score(y_train,y_train_pred))
print("R2 TEST:", r2_score(y_test,y_test_pred))


MSE Train: 0.12114147881199924
MSE TEST: 2.0223849512949665
R2 Train: 0.8662629197454729
R2 TEST: -0.510209863091617


In [429]:
from sklearn.linear_model import Lasso

lasso_regressor = Lasso(alpha=0.01, random_state=0)  
lasso_regressor.fit(X_train, y_train)

y_train_pred_lasso = lasso_regressor.predict(X_train)
y_test_pred_lasso = lasso_regressor.predict(X_test)

print("\nLasso Regression:")
print("MSE Train:", mean_squared_error(y_train, y_train_pred_lasso))
print("MSE Test:", mean_squared_error(y_test, y_test_pred_lasso))
print("R2 Train:", r2_score(y_train, y_train_pred_lasso))
print("R2 Test:", r2_score(y_test, y_test_pred_lasso))



Lasso Regression:
MSE Train: 0.6189626326236027
MSE Test: 1.9058543100748302
R2 Train: 0.3166811559053144
R2 Test: -0.42319095820392305


In [430]:
from sklearn.decomposition import PCA
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LinearRegression

pca = PCA(n_components=0.95)  # Keep 95% of variance
linear_reg = LinearRegression()
pcr = make_pipeline(pca, linear_reg)

pcr.fit(X_train, y_train)

y_train_pred_pcr = pcr.predict(X_train)
y_test_pred_pcr = pcr.predict(X_test)

print("\nPrincipal Component Regression:")
print("MSE Train:", mean_squared_error(y_train, y_train_pred_pcr))
print("MSE Test:", mean_squared_error(y_test, y_test_pred_pcr))
print("R2 Train:", r2_score(y_train, y_train_pred_pcr))
print("R2 Test:", r2_score(y_test, y_test_pred_pcr))



Principal Component Regression:
MSE Train: 0.7723967766238682
MSE Test: 1.6259881278969333
R2 Train: 0.14729380294264227
R2 Test: -0.21420173070783166


In [433]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error, r2_score

random_forest_regressor = RandomForestRegressor(random_state=42, n_estimators=40)  # n_estimators can be adjusted

random_forest_regressor.fit(X_train, y_train)
dump(random_forest_regressor, '412_rf_model.joblib')

y_train_pred_rf = random_forest_regressor.predict(X_train)
y_test_pred_rf = random_forest_regressor.predict(X_test)

print("Random Forest Regressor:")
print("MSE Train:", mean_squared_error(y_train, y_train_pred_rf))
print("MSE Test:", mean_squared_error(y_test, y_test_pred_rf))

print("R2 Train:", r2_score(y_train, y_train_pred_rf))
print("R2 Test:", r2_score(y_test, y_test_pred_rf))


Random Forest Regressor:
MSE Train: 0.1284448641532753
MSE Test: 1.427212730230189
R2 Train: 0.8582001699664993
R2 Test: -0.06576680198466378


In [432]:
"""
In this cell, almost all combinations have been tried.
PCA+RF, RF+Lasso, Lasso+RF, RF + Average, PCR + Average and etc.
"""

X_train_ensemble = np.column_stack((y_train_pred_rf, y_train_pred_lasso)) 
X_test_ensemble = np.column_stack((y_test_pred_rf, y_test_pred_lasso))

ensemble_regressor = DecisionTreeRegressor(random_state=42)
ensemble_regressor.fit(X_train_ensemble, y_train)

y_train_pred_ensemble = ensemble_regressor.predict(X_train_ensemble)
y_test_pred_ensemble = ensemble_regressor.predict(X_test_ensemble)

print("Ensemble Model with Decision Tree Regressor:")
print("MSE Train:", mean_squared_error(y_train, y_train_pred_ensemble))
print("MSE Test:", mean_squared_error(y_test, y_test_pred_ensemble))
print("R2 Train:", r2_score(y_train, y_train_pred_ensemble))
print("R2 Test:", r2_score(y_test, y_test_pred_ensemble))

Ensemble Model with Decision Tree Regressor:
MSE Train: 1.5981120815128126e-33
MSE Test: 1.524096639763676
R2 Train: 1.0
R2 Test: -0.13811456923770837
