# Random Search Optimization

In [32]:
DATASET = "../data/texas_dataset.xlsx"
RS_CV = 10
SEED = 42
TEST_SIZE = 0.2

In [75]:
import nltk
import pandas as pd
import re
import spacy
import numpy as np
from scipy.stats import randint, uniform
from sklearn.datasets import load_iris
from sklearn.model_selection import RandomizedSearchCV, train_test_split
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report
from tqdm.notebook import tqdm_notebook

In [34]:
tqdm_notebook.pandas()

## Loading

In [35]:
columns_dict = {
  "Topic": "topic",
  "Human Evaluation": "human_evaluation",
  "Round_score": "human_evaluation",
  "Question": "question",
  "Student Answer": "answer",
  "Question_ID": "question_id",
  "Instructor answers": "intructor_answers",
  "Score": "score"
}
dataset = pd.read_excel(DATASET)
dataset.rename(columns=columns_dict, inplace=True)
dataset.head()

Unnamed: 0,ID,question_id,question,intructor_answers,answer,score,human_evaluation
0,1,2023-01-01,What is the role of a prototype program in pr...,To simulate the behaviour of portions of the ...,High risk problems are address in the prototy...,3.5,4
1,2,2023-01-01,What is the role of a prototype program in pr...,To simulate the behaviour of portions of the ...,To simulate portions of the desired final pro...,5.0,5
2,3,2023-01-01,What is the role of a prototype program in pr...,To simulate the behaviour of portions of the ...,A prototype program simulates the behaviors o...,4.0,4
3,4,2023-01-01,What is the role of a prototype program in pr...,To simulate the behaviour of portions of the ...,Defined in the Specification phase a prototyp...,5.0,5
4,5,2023-01-01,What is the role of a prototype program in pr...,To simulate the behaviour of portions of the ...,It is used to let the users have a first idea...,3.0,3


In [36]:
dataset['answer'] = dataset['answer'].astype(str)
dataset['question'] = dataset['question'].astype(str)
dataset['question_id'] = dataset['question_id'].astype(str)
dataset.drop('score', inplace=True, axis=1)

## Text cleaning

In [37]:
nltk.download("stopwords")
stopwords = nltk.corpus.stopwords.words("english")

[nltk_data] Downloading package stopwords to
[nltk_data]     /home/hyanbatista42/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [38]:
def clean_text(text: str) -> str:
  # Remove punctuations and numbers
  text = re.sub(r'[^a-zA-Z]', ' ', text)

  # Remove single characters
  text = re.sub(r'\s+[a-zA-Z]\s+', ' ', text)

  # Remove multiple spaces
  text = re.sub(r'\s+', ' ', text)

  return text.strip().lower()

def remove_stopwords(text: str) -> str:
  return ' '.join([token for token in text.split() if token not in stopwords])

In [39]:
dataset["input"] = dataset.progress_apply(lambda row: row["question"] + " " + row["answer"], axis=1)
dataset["input"] = dataset["input"].progress_apply(lambda x: clean_text(remove_stopwords(x)))

  0%|          | 0/2442 [00:00<?, ?it/s]

  0%|          | 0/2442 [00:00<?, ?it/s]

### Data spliting

In [40]:
X = dataset.drop("human_evaluation", axis=1)
y = dataset["human_evaluation"]

In [41]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=TEST_SIZE, shuffle=True, random_state=SEED, stratify=y)
len(X_train), len(X_test), len(y_train), len(y_test)

(1953, 489, 1953, 489)

In [42]:
y_test.unique()

array([5, 4, 3, 1, 2, 0])

### Dealing with unbalanced data

In [43]:
dataset = pd.concat([X_train, y_train], axis=1)
dataset.head()

Unnamed: 0,ID,question_id,question,intructor_answers,answer,input,human_evaluation
962,963,2023-01-06,What is a pointer?,A variable that contains the address in memory...,It is like a variable however instead of holdi...,what pointer it like variable however instead ...,5
2194,2195,2023-02-12,What is the experimental approach for measurin...,Implement the algorithm and measure the physic...,Experimental means you would actually write a ...,what experimental approach measuring running t...,5
46,47,2023-02-01,What stages in the software life cycle are in...,The testing stage can influence both the codi...,"Depending on how the work is done, Testing is...",what stages software life cycle influenced tes...,2
1426,1427,2023-05-08,Which implementation (array-based vs. list-bas...,"Link-based, because they are dynamic (no size ...",Array-based prevents the push operation from ...,which implementation array based vs list based...,5
2441,2442,2023-10-12,How many steps does it take to search a node i...,The height of the tree.,it depends on the install search tree then fro...,how many steps take search node binary search ...,2


In [44]:
dataset["human_evaluation"].value_counts()

human_evaluation
5    1239
4     329
3     232
2     113
1      21
0      19
Name: count, dtype: int64

In [45]:
label_5_ds = dataset[dataset["human_evaluation"] == 5]
label_5_ds = pd.concat([label_5_ds] * 1, axis=0, ignore_index=True)
len(label_5_ds)

1239

In [46]:
label_4_ds = dataset[dataset["human_evaluation"] == 4]
label_4_ds = pd.concat([label_4_ds] * 4, axis=0, ignore_index=True)
len(label_4_ds)

1316

In [47]:
label_3_ds = dataset[dataset["human_evaluation"] == 3]
label_3_ds = pd.concat([label_3_ds] * 5, axis=0, ignore_index=True)
len(label_3_ds)

1160

In [48]:
label_2_ds = dataset[dataset["human_evaluation"] == 2]
label_2_ds = pd.concat([label_2_ds] * 10, axis=0, ignore_index=True)
len(label_2_ds)

1130

In [49]:
label_1_ds = dataset[dataset["human_evaluation"] == 1]
label_1_ds = pd.concat([label_1_ds] * 50, axis=0, ignore_index=True)
len(label_1_ds)

1050

In [50]:
label_0_ds = dataset[dataset["human_evaluation"] == 0]
label_0_ds = pd.concat([label_0_ds] * 53, axis=0, ignore_index=True)
len(label_0_ds)

1007

In [51]:
dataset = pd.concat([label_0_ds, label_1_ds, label_2_ds, label_3_ds, label_4_ds, label_5_ds], axis=0)
len(dataset)

6902

In [52]:
dataset = dataset.sample(frac=1, random_state=SEED).reset_index(drop=True)
dataset.head()

Unnamed: 0,ID,question_id,question,intructor_answers,answer,input,human_evaluation
0,1223,2023-04-07,How are linked lists passed as arguments to a ...,By reference.,not answered,how linked lists passed arguments function ans...,0
1,1777,2023-04-10,What is a binary tree?,A tree for which the maximum number of childre...,A binary search tree is a tree that also has t...,what binary tree binary search tree tree also ...,5
2,2240,2023-03-12,Order the following functions by their running...,log(log n); 2^(log n) ; n^2 ; n^3; n!,longest to shortest:<br>n^3; n!; n^2; 2^(log n...,order following functions running time log log...,3
3,690,2023-02-04,What is the main difference between strings de...,The strings declared using an array of charact...,array it is the collection of similar data ty...,what main difference strings declared using ty...,3
4,2422,2023-10-12,How many steps does it take to search a node i...,The height of the tree.,2^n where n is the # of levels the binary tree...,how many steps take search node binary search ...,2


In [53]:
X_train = dataset.drop("human_evaluation", axis=1)
y_train = dataset["human_evaluation"]

In [54]:
X_train.to_csv("../data/X_train.csv")
y_train.to_csv("../data/y_train.csv")
X_test.to_csv("../data/X_test.csv")
y_test.to_csv("../data/y_test.csv")

## Encoding

In [58]:
nlp = spacy.load("en_core_web_lg")

In [63]:
doc = nlp(X_train.iloc[0]["input"])
doc.vector.shape

(300,)

In [70]:
X_train_encoded = np.array([nlp(x["input"]).vector for _, x in X_train.iterrows()], dtype=np.float32)
X_test_encoded = np.array([nlp(x["input"]).vector for _, x in X_test.iterrows()], dtype=np.float32)

## Training

### Random Forest

In [71]:
param_grid = {
    'n_estimators': randint(10, 200),
    'max_depth': randint(1, 20),
    'min_samples_split': randint(2, 20),
    'min_samples_leaf': randint(1, 20),
    'bootstrap': [True, False],
    'criterion': ['gini', 'entropy']
}

rf_clf = RandomForestClassifier()

random_search = RandomizedSearchCV(
    estimator=rf_clf,
    param_distributions=param_grid,
    n_iter=100,
    cv=RS_CV,
    random_state=SEED,
    n_jobs=-1,
)

random_search.fit(X_train_encoded, y_train)
best_rf_clf = random_search.best_estimator_
y_pred = best_rf_clf.predict(X_test_encoded)
report = classification_report(y_test, y_pred)
print(report)

              precision    recall  f1-score   support

           0       1.00      0.40      0.57         5
           1       1.00      0.40      0.57         5
           2       0.29      0.07      0.11        29
           3       0.45      0.09      0.14        58
           4       0.38      0.16      0.22        82
           5       0.69      0.96      0.80       310

    accuracy                           0.66       489
   macro avg       0.63      0.35      0.40       489
weighted avg       0.59      0.66      0.58       489



In [72]:
random_search.best_estimator_

### SVM

In [76]:
param_grid = {
    'C': uniform(0.1, 10),
    'kernel': ['linear', 'poly', 'rbf', 'sigmoid'],
    'gamma': ['scale', 'auto'],
    'degree': randint(1, 10)
}

svc = SVC()

random_search = RandomizedSearchCV(
    estimator=svc,
    param_distributions=param_grid,
    n_iter=100,
    cv=RS_CV,
    random_state=SEED,
    n_jobs=-1,
)

random_search.fit(X_train_encoded, y_train)
best_svc = random_search.best_estimator_
y_pred = best_svc.predict(X_test_encoded)
report = classification_report(y_test, y_pred)
print(report)

In [74]:
random_search.best_estimator_