In [1]:
import re
import os
import tqdm
from glob import glob

import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from pprint import pprint
import graphviz

from collections import defaultdict
from bs4 import BeautifulSoup

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn import tree
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import train_test_split 
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.preprocessing import normalize
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

from collections import Counter
import gensim
import nltk
from gensim.models import KeyedVectors
from gensim.models import Word2Vec
from nltk.tokenize import word_tokenize
from gensim.scripts.glove2word2vec import glove2word2vec


In [2]:
data_path = "test/html/*.html"

code2convos = dict()

pbar = tqdm.tqdm(sorted(list(glob(data_path))))
for path in pbar:
    # print(Path.cwd() / path)
    file_code = os.path.basename(path).split(".")[0]
    with open(path, "r", encoding="latin1") as fh:
            
        # get the file id to use it as key later on
        fid = os.path.basename(path).split(".")[0]

        # read the html file
        html_page = fh.read()

        # parse the html file with bs4 so we can extract needed stuff
        soup = BeautifulSoup(html_page, "html.parser")

        # grab the conversations with the data-testid pattern
        data_test_id_pattern = re.compile(r"conversation-turn-[0-9]+")
        conversations = soup.find_all("div", attrs={"data-testid": data_test_id_pattern})

        convo_texts = []

        for i, convo in enumerate(conversations):
            convo = convo.find_all("div", attrs={"data-message-author-role":re.compile( r"[user|assistant]") })
            if len(convo) > 0:
                role = convo[0].get("data-message-author-role")
                convo_texts.append({
                        "role" : role,
                        "text" : convo[0].text
                    }
                )
                
        code2convos[file_code] = convo_texts

100%|██████████| 188/188 [00:41<00:00,  4.51it/s]


#### Things to do:
- Prompt matching with questions
- Feature Engineering
- Question Grades preparation
- Train/Test split
- Fitting a model for predicting the scores

#### Prompt Matching
> We want to match the prompts with the questions in the Homework Let's
> do it with a simple term frequency vectorizing method. For each prompt,
> we will come with a vector that represents it. We will do the same
> thing with each of the homework questions. Then, we will calculate the
> vectors distanance to do the matching

In [5]:
prompts = []
code2prompts = defaultdict(list)
for code , convos in code2convos.items():
    user_prompts = []
    for conv in convos:
        if conv["role"] == "user":
            prompts.append(conv["text"])
            user_prompts.append(conv["text"])
    code2prompts[code] = user_prompts    

In [6]:
prompts[0]

"# Hypothetical Feature 1: Daily Fish Consumption\nX_train['Daily Fish Consumption'] = X_train['body_mass_g'] / X_train.groupby('diet')['body_mass_g'].transform('mean')\n\n# Hypothetical Feature 2: Activity Index\nX_train['Activity Index'] = X_train['flipper_length_mm'] / X_train['body_mass_g']\n\n# Display the correlations with the target variable\ncorrelation_feature1 = X_train['Daily Fish Consumption'].corr(y_train)\ncorrelation_feature2 = X_train['Activity Index'].corr(y_train)\n\n(correlation_feature1, correlation_feature2)\n THÄ°S Ä°S THE CODE"

In [7]:
questions = [
    """Initialize
*   First make a copy of the notebook given to you as a starter.
*   Make sure you choose Connect form upper right.
*   You may upload the data to the section on your left on Colab, than right click on the .csv file and get the path of the file by clicking on "Copy Path". You will be using it when loading the data.

""",
#####################
    """Load training dataset (5 pts)
    *  Read the .csv file with the pandas library
""",
#####################
"""Understanding the dataset & Preprocessing (15 pts)
Understanding the Dataset: (5 pts)
> - Find the shape of the dataset (number of samples & number of attributes). (Hint: You can use the **shape** function)
> - Display variable names (both dependent and independent).
> - Display the summary of the dataset. (Hint: You can use the **info** function)
> - Display the first 5 rows from training dataset. (Hint: You can use the **head** function)
Preprocessing: (10 pts)

> - Check if there are any missing values in the dataset. If there are, you can either drop these values or fill it with most common values in corresponding rows. **Be careful that you have enough data for training the  model.**

> - Encode categorical labels with the mappings given in the cell below. (Hint: You can use **map** function)
""",
#####################
"""Set X & y, split data (5 pts)

*   Shuffle the dataset.
*   Seperate your dependent variable X, and your independent variable y. The column health_metrics is y, the rest is X.
*   Split training and test sets as 80% and 20%, respectively.
""",
#####################
"""Features and Correlations (10 pts)

* Correlations of features with health (4 points)
Calculate the correlations for all features in dataset. Highlight any strong correlations with the target variable. Plot your results in a heatmap.

* Feature Selection (3 points)
Select a subset of features that are likely strong predictors, justifying your choices based on the computed correlations.

* Hypothetical Driver Features (3 points)
Propose two hypothetical features that could enhance the model's predictive accuracy for Y, explaining how they might be derived and their expected impact. Show the resulting correlations with target variable.

* __Note:__ You get can get help from GPT.
""",
#####################
"""Tune Hyperparameters (20 pts)
* Choose 2 hyperparameters to tune. You can use the Scikit learn decision tree documentation for the available hyperparameters *(Hyperparameters are listed under "Parameters" in the documentation)*. Use GridSearchCV for hyperparameter tuning, with a cross-validation value of 5. Use validation accuracy to pick the best hyper-parameter values. (15 pts)
-Explain the hyperparameters you chose to tune. *(What are the hyperparameters you chose? Why did you choose them?)* (5 pts)
""",
#####################
"""Re-train and plot the decision tree with the hyperparameters you have chosen (15 pts)
- Re-train model with the hyperparameters you have chosen in part 5). (10 pts)
- Plot the tree you have trained. (5 pts)
Hint: You can import the **plot_tree** function from the sklearn library.
""",
#####################
"""Test your classifier on the test set (20 pts)
- Predict the labels of testing data using the tree you have trained in step 6. (10 pts)
- Report the classification accuracy. (2 pts)
- Plot & investigate the confusion matrix. Fill the following blanks. (8 pts)
> The model most frequently mistakes class(es) _________ for class(es) _________.
Hint: You can use the confusion_matrix function from sklearn.metrics
""",
#####################
"""Find the information gain on the first split (10 pts)""",
#####################
]

In [8]:
"""
glove_input_file = 'glove/glove.6B.100d.txt'
word2vec_output_file = 'glove/glove.6B.100d.word2vec.txt'
glove2word2vec(glove_input_file, word2vec_output_file)

glove_input_file = 'glove/glove.6B.50d.txt'
word2vec_output_file = 'glove/glove.6B.50d.word2vec.txt'
glove2word2vec(glove_input_file, word2vec_output_file)

glove_input_file = 'glove/glove.6B.200d.txt'
word2vec_output_file = 'glove/glove.6B.200d.word2vec.txt'
glove2word2vec(glove_input_file, word2vec_output_file)
"""

"\nglove_input_file = 'glove/glove.6B.100d.txt'\nword2vec_output_file = 'glove/glove.6B.100d.word2vec.txt'\nglove2word2vec(glove_input_file, word2vec_output_file)\n\nglove_input_file = 'glove/glove.6B.50d.txt'\nword2vec_output_file = 'glove/glove.6B.50d.word2vec.txt'\nglove2word2vec(glove_input_file, word2vec_output_file)\n\nglove_input_file = 'glove/glove.6B.200d.txt'\nword2vec_output_file = 'glove/glove.6B.200d.word2vec.txt'\nglove2word2vec(glove_input_file, word2vec_output_file)\n"

In [9]:
glove_model = KeyedVectors.load_word2vec_format('glove/glove.6B.50d.word2vec.txt', binary=False)
def preprocess_and_tokenize(text):
    words = word_tokenize(text.lower())
    return [word for word in words if word.isalpha()]


In [11]:
def vectorize_prompts(prompts, model):
    vectorized = []
    for prompt in prompts:
        words = preprocess_and_tokenize(prompt)
        word_vectors = [model[word] for word in words if word in model.key_to_index]

        if len(word_vectors) == 0:
            vectorized.append(np.zeros(model.vector_size))  
        else:
            vectorized.append(np.mean(word_vectors, axis=0)) 

    return pd.DataFrame(vectorized)

code2prompts_glove = dict()

for code, user_prompts in code2prompts.items():
    if len(user_prompts) > 0:
        vectorized_df = vectorize_prompts(user_prompts, glove_model)
        code2prompts_glove[code] = vectorized_df
    else:
        print(f"{code}.html has no prompts")
        

In [13]:
vectorizer = TfidfVectorizer()
vectorizer = vectorizer.fit(prompts + questions)
questions_TF_IDF = pd.DataFrame(vectorizer.transform(questions).toarray(), columns=vectorizer.get_feature_names_out())


In [14]:
code2prompts_tf_idf = dict()
for code, user_prompts in code2prompts.items():
    if len(user_prompts) == 0:
        print(code+".html")
        continue
    prompts_TF_IDF = pd.DataFrame(vectorizer.transform(user_prompts).toarray(), columns=vectorizer.get_feature_names_out())
    code2prompts_tf_idf[code] = prompts_TF_IDF

In [15]:
code2cosine = dict()
for code, user_prompts_tf_idf in code2prompts_tf_idf.items():
    code2cosine[code] = pd.DataFrame(cosine_similarity(questions_TF_IDF,user_prompts_tf_idf))


In [16]:
question_of_prompts = dict()

for code, df in code2cosine.items():
    max_indices = []

    for col in df.columns:
        max_index = df[col].idxmax()

        max_indices.append(max_index)

    question_of_prompts[code] = max_indices
    

In [17]:
num_questions = len(questions)

new_code_counts = dict()

for code, indices in question_of_prompts.items():
    counts = Counter(indices)

    count_vector = [counts.get(i, 0) for i in range(num_questions)]

    new_code_counts[code] = count_vector


In [18]:
vector_size = 50   

final_df = pd.DataFrame(index=code2prompts_glove.keys(), columns=[f'q{i}_feature_{j}' for i in range(num_questions) for j in range(vector_size)])

for code in code2prompts_glove.keys():
    glove_vectors = code2prompts_glove[code]
    question_indices = question_of_prompts[code]

    summed_vectors = np.zeros((num_questions, vector_size))
    prompt_counts = Counter(question_indices)

    for vector, question_idx in zip(glove_vectors, question_indices):
        scaled_vector = vector / prompt_counts[question_idx]
        summed_vectors[question_idx] += scaled_vector

    for i in range(num_questions):
        column_labels = [f'q{i}_feature_{j}' for j in range(vector_size)]
        final_df.loc[code, column_labels] = summed_vectors[i]

final_df.rename_axis('code', inplace=True)
final_df.head()

Unnamed: 0_level_0,q0_feature_0,q0_feature_1,q0_feature_2,q0_feature_3,q0_feature_4,q0_feature_5,q0_feature_6,q0_feature_7,q0_feature_8,q0_feature_9,...,q8_feature_40,q8_feature_41,q8_feature_42,q8_feature_43,q8_feature_44,q8_feature_45,q8_feature_46,q8_feature_47,q8_feature_48,q8_feature_49
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
00941713-c3a2-4d27-81dc-cd447ace4a47,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
00aea02f-a95a-4c04-8be3-777461732cdf,23.0,23.0,23.0,23.0,23.0,23.0,23.0,23.0,23.0,23.0,...,19.5,19.5,19.5,19.5,19.5,19.5,19.5,19.5,19.5,19.5
04fdb619-d902-4e98-a5e9-a8198bfe047c,19.5,19.5,19.5,19.5,19.5,19.5,19.5,19.5,19.5,19.5,...,29.0,29.0,29.0,29.0,29.0,29.0,29.0,29.0,29.0,29.0
05029661-f8d8-441b-9cab-3c79f28a8b26,11.666667,11.666667,11.666667,11.666667,11.666667,11.666667,11.666667,11.666667,11.666667,11.666667,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
059a146e-a37c-498f-8c0b-5a78204249cb,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0,19.0


In [19]:
scaler = StandardScaler()

normalized_data = scaler.fit_transform(final_df)

normalized_df = pd.DataFrame(normalized_data, index=final_df.index, columns=final_df.columns)

normalized_df.rename_axis('code', inplace=True)

# Feature Engineering
- Number of prompts that a user asked
- Number of complaints that a user makes e.g "the code gives this error!"
- User prompts average number of characters

In [20]:
coefficients = [0.1,0.05,0.15,0.05,0.1,0.2,0.15,0.1,0.1]

modified_code_counts = {}

for code, count_vector in new_code_counts.items():
    modified_vector = [count * coeff for count, coeff in zip(count_vector, coefficients)]
    modified_code_counts[code] = modified_vector

modified_counts_df = pd.DataFrame.from_dict(modified_code_counts, orient='index')

modified_counts_df.index.name = 'code'

extended_df = normalized_df.merge(modified_counts_df, left_on='code', right_index=True)

modified_count_col_names = [f'modified_count_{i}' for i in range(num_questions)]
extended_df.rename(columns=dict(zip(range(num_questions), modified_count_col_names)), inplace=True)


In [21]:
code2questionmapping = dict()
for code, cosine_scores in code2cosine.items():
    code2questionmapping[code] = code2cosine[code].max(axis=1).tolist()

question_mapping_scores = pd.DataFrame(code2questionmapping).T
question_mapping_scores.reset_index(inplace=True)
question_mapping_scores.rename(columns={i: f"Q_{i}" for i in range(len(questions))}, inplace=True)
question_mapping_scores.rename(columns={"index" : "code"}, inplace=True)


In [22]:
merged_df = question_mapping_scores.merge(extended_df, on='code', how='left')
# merged_df = question_mapping_scores.merge(modified_counts_df, on='code', how='left')
merged_df.shape

(188, 469)

In [28]:
merged_df.head()

Unnamed: 0,code,Q_0,Q_1,Q_2,Q_3,Q_4,Q_5,Q_6,Q_7,Q_8,...,q8_feature_49,modified_count_0,modified_count_1,modified_count_2,modified_count_3,modified_count_4,modified_count_5,modified_count_6,modified_count_7,modified_count_8
0,00941713-c3a2-4d27-81dc-cd447ace4a47,0.158311,0.104992,0.284418,0.085794,0.288204,0.956225,0.342733,0.257241,0.119235,...,-1.067676,0.0,0.0,0.6,0.0,0.2,0.6,0.15,0.2,0.0
1,00aea02f-a95a-4c04-8be3-777461732cdf,0.235186,0.195519,0.381383,0.601936,0.668807,0.212292,0.556832,0.526469,0.555292,...,0.333724,0.2,0.05,2.7,0.2,0.2,0.8,0.3,0.4,0.2
2,04fdb619-d902-4e98-a5e9-a8198bfe047c,0.182626,0.353836,0.80564,0.824759,0.763438,0.815282,0.733843,0.426418,0.738563,...,1.016457,0.4,0.05,1.05,0.2,0.6,0.6,0.45,0.2,0.3
3,05029661-f8d8-441b-9cab-3c79f28a8b26,0.223914,0.607638,0.80564,0.920694,0.720973,0.988016,0.406387,1.0,0.812817,...,-1.067676,0.3,0.2,1.95,0.35,1.7,1.2,0.75,0.4,0.4
4,059a146e-a37c-498f-8c0b-5a78204249cb,0.190508,0.438151,0.843587,0.953558,0.710209,0.210283,0.333815,0.770213,0.555292,...,0.297791,0.0,0.1,1.05,0.2,0.3,0.0,0.15,0.2,0.1


#### Merging scores with features

#### Fitting a model

#### Predicting and Analyzing 

In [35]:
from joblib import load

regressor = load('412_rf_model.joblib')
scaler = load('scaler.joblib')

X = merged_df.iloc[:, 1:].to_numpy()
y_test_pred = regressor.predict(X)

adjusted_pred = scaler.inverse_transform(y_test_pred.reshape(-1, 1))
original_grade_pred = adjusted_pred + 70

codes = merged_df.iloc[:, 0].to_numpy()

code_grade_pairs = np.column_stack((codes, original_grade_pred.flatten()))

code_grade_df = pd.DataFrame(code_grade_pairs, columns=["Code", "Predicted Grade"])

print(code_grade_df)

code_grade_df.to_csv('output/test_grade_predictions.txt', sep=',', header=False, index=False)


                                     Code Predicted Grade
0    00941713-c3a2-4d27-81dc-cd447ace4a47            86.1
1    00aea02f-a95a-4c04-8be3-777461732cdf          97.425
2    04fdb619-d902-4e98-a5e9-a8198bfe047c           98.45
3    05029661-f8d8-441b-9cab-3c79f28a8b26          92.675
4    059a146e-a37c-498f-8c0b-5a78204249cb          95.575
..                                    ...             ...
183  fab774ac-38c8-4d86-910c-7ad0fa8470c5          90.825
184  fac3042d-d72d-43a7-9170-a424e3061fac           96.25
185  fbf473eb-ea6f-4a4a-b2d8-405bc09f9850          95.875
186  fccd270d-63f8-42b6-b73e-13f6d3e5f612           96.25
187  fe81cca3-d9c2-4d82-97a4-9cc1444ea219           95.85

[188 rows x 2 columns]
