# Lab

##### Objective : The main purpose behind this lab is to get familiar with NLP language models using Sklearn library.

## Part 1: Language Modeling / Regression

In [1]:
import pandas as pd
import numpy as np
import re
import string
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVR
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

In [2]:
# Load the dataset
df = pd.read_csv('https://raw.githubusercontent.com/dbbrandt/short_answer_granding_capstone_project/master/data/sag/answers.csv')

In [3]:
df.head()

Unnamed: 0,id,answer,score,correct
0,1.1,High risk problems are address in the prototyp...,3.5,0.0
1,1.1,To simulate portions of the desired final prod...,5.0,1.0
2,1.1,A prototype program simulates the behaviors of...,4.0,1.0
3,1.1,Defined in the Specification phase a prototype...,5.0,1.0
4,1.1,It is used to let the users have a first idea ...,3.0,0.0


In [4]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2442 entries, 0 to 2441
Data columns (total 4 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   id       2442 non-null   float64
 1   answer   2442 non-null   object 
 2   score    2442 non-null   float64
 3   correct  2442 non-null   float64
dtypes: float64(3), object(1)
memory usage: 76.4+ KB


In [5]:
df.isnull().sum()

id         0
answer     0
score      0
correct    0
dtype: int64

In [6]:
df.duplicated().sum()

101

In [7]:
df.drop_duplicates(inplace=True)
df.duplicated().sum()

0

In [8]:
# Remove Non-String
def filter_non_string(df, column):
    """
    Filter out rows with non-string values in the specified column.
    Convert non-string values to strings.
    """
    df = df.dropna(subset=[column])
    df[column] = df[column].astype(str)
    return df

In [9]:
# Convert In LowerCase
def normalize_text(text):
    """Convert text to lowercase to ensure consistency across the corpus."""
    return text.lower()

In [10]:
# Remove HTML Tags
def remove_html_tags(text):
    """Remove HTML tags from text."""
    return re.sub(r'<.*?>', '', text)

In [11]:
# Remove URL Or HyperLink
def remove_urls(text):
    """Remove URLs or hyperlinks from the text."""
    return re.sub(r'http\S+|www\S+', '', text)

In [12]:
# Remove Numeric Digit
def remove_numbers(text):
    """Exclude numerical digits from the text."""
    return re.sub(r'\d+', '', text)

In [13]:
# Remove Punctuation
def remove_punctuation(text):
    """Remove punctuation marks from the text."""
    return text.translate(str.maketrans('', '', string.punctuation))

In [14]:
# Initialize the lemmatizer
lemmatizer = WordNetLemmatizer()

def lemmatize_text(text):
    # Tokenization
    tokens = word_tokenize(text)
    # Lemmatization
    lemmatized = [lemmatizer.lemmatize(word) for word in tokens]
    return " ".join(lemmatized)

In [15]:
# Split Text In Token
def tokenize_text(text):
    """Split the text into individual words or tokens."""
    return word_tokenize(text)

In [16]:
# Eliminate Stopwords
def remove_stopwords(tokens):
    """Eliminate common stopwords from the tokenized text."""
    stop_words = set(stopwords.words('english'))
    return [word for word in tokens if word not in stop_words]

In [17]:
def preprocess_text(df):
    df = filter_non_string(df, 'answer')
    df['answer'] = df['answer'].apply(normalize_text)
    df['answer'] = df['answer'].apply(remove_html_tags)
    df['answer'] = df['answer'].apply(remove_urls)
    df['answer'] = df['answer'].apply(remove_numbers)
    df['answer'] = df['answer'].apply(remove_punctuation)
    df['answer'] = df['answer'].apply(lemmatize_text)
    df['answer'] = df['answer'].apply(tokenize_text)
    df['answer'] = df['answer'].apply(remove_stopwords)
    return df

# Usage:
df_processed = preprocess_text(df)

In [18]:
df_processed.head()

Unnamed: 0,id,answer,score,correct
0,1.1,"[high, risk, problem, address, prototype, prog...",3.5,0.0
1,1.1,"[simulate, portion, desired, final, product, q...",5.0,1.0
2,1.1,"[prototype, program, simulates, behavior, port...",4.0,1.0
3,1.1,"[defined, specification, phase, prototype, sti...",5.0,1.0
4,1.1,"[used, let, user, first, idea, completed, prog...",3.0,0.0


In [19]:
from gensim.models import Word2Vec

# For Skip-gram, set sg=1
model = Word2Vec(sentences=df_processed['answer'], vector_size=100, window=5, min_count=1, workers=4, sg=1)

In [20]:
def document_vector(word2vec_model, doc):
    # Remove out-of-vocabulary words and get word vectors
    doc_vectors = [word2vec_model.wv[word] for word in doc if word in word2vec_model.wv]

    # Calculate the mean vector
    if doc_vectors:
        return np.mean(doc_vectors, axis=0)
    else:
        # Handle cases where no valid word vectors are found
        return np.zeros(word2vec_model.vector_size)

# Assuming 'df_processed' contains your preprocessed data
df_processed['vector'] = df_processed['answer'].apply(lambda x: document_vector(model, x))

In [21]:
# Drop the 'answer' column
df_processed.drop(columns=['answer'], inplace=True)

df_processed.head()

Unnamed: 0,id,score,correct,vector
0,1.1,3.5,0.0,"[-0.0014370893, 0.075638264, -0.048081852, 0.0..."
1,1.1,5.0,1.0,"[-0.018521447, 0.06257434, -0.052992724, 0.028..."
2,1.1,4.0,1.0,"[-0.009561417, 0.06691981, -0.0512529, 0.04687..."
3,1.1,5.0,1.0,"[-0.0072238813, 0.064176045, -0.050638255, 0.0..."
4,1.1,3.0,0.0,"[-0.02020907, 0.05126546, -0.034119498, 0.0329..."


In [22]:
# Split the data into features (X) and target (y)
X = df_processed['vector'].tolist()
y = df_processed['score']

In [23]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [27]:
# Initialize models
svr_model = SVR()
lr_model = LinearRegression()
dt_model = DecisionTreeRegressor()
rf_model = RandomForestRegressor()

In [25]:
# Set up the parameter grids for each model
param_grid_svr = {'C': [0.1, 1, 10], 'kernel': ['linear', 'poly', 'rbf']}
param_grid_lr = {}  # LinearRegression does not have hyperparameters to tune
param_grid_dt = {'max_depth': [None, 10, 20, 30], 'min_samples_split': [2, 10, 20]}
param_grid_rf = {'n_estimators': [10, 50, 100], 'max_depth': [None, 10, 20, 30], 'min_samples_split': [2, 10, 20]}

# Initialize GridSearchCV for each model
grid_search_svr = GridSearchCV(svr_model, param_grid_svr, cv=5, scoring='neg_mean_squared_error')
grid_search_lr = GridSearchCV(lr_model, param_grid_lr, cv=5, scoring='neg_mean_squared_error')
grid_search_dt = GridSearchCV(dt_model, param_grid_dt, cv=5, scoring='neg_mean_squared_error')
grid_search_rf = GridSearchCV(rf_model, param_grid_rf, cv=5, scoring='neg_mean_squared_error')

# Fit the models
grid_search_svr.fit(X_train, y_train)
grid_search_lr.fit(X_train, y_train)
grid_search_dt.fit(X_train, y_train)
grid_search_rf.fit(X_train, y_train)

# Best parameters and scores
best_params_svr = grid_search_svr.best_params_
best_score_svr = -grid_search_svr.best_score_

best_params_lr = grid_search_lr.best_params_
best_score_lr = -grid_search_lr.best_score_

best_params_dt = grid_search_dt.best_params_
best_score_dt = -grid_search_dt.best_score_

best_params_rf = grid_search_rf.best_params_
best_score_rf = -grid_search_rf.best_score_

# Print results
print("SVR Best Params:", best_params_svr)
print("SVR Best CV Score:", best_score_svr)

print("Linear Regression Best Params:", best_params_lr)
print("Linear Regression Best CV Score:", best_score_lr)

print("Decision Tree Best Params:", best_params_dt)
print("Decision Tree Best CV Score:", best_score_dt)

print("Random Forest Best Params:", best_params_rf)
print("Random Forest Best CV Score:", best_score_rf)

SVR Best Params: {'C': 10, 'kernel': 'rbf'}
SVR Best CV Score: 1.172725333467206
Linear Regression Best Params: {}
Linear Regression Best CV Score: 1.006498434110711
Decision Tree Best Params: {'max_depth': 10, 'min_samples_split': 20}
Decision Tree Best CV Score: 1.2911495431951052
Random Forest Best Params: {'max_depth': None, 'min_samples_split': 2, 'n_estimators': 100}
Random Forest Best CV Score: 0.9531643492829762


Now, let’s discuss the interpretation of these results:

- The **Random Forest** model achieved the lowest MSE, indicating better performance compared to the other models.
- The **SVR** model with an RBF kernel also performed reasonably well, but its MSE was slightly higher than that of Random Forest.
- **Linear Regression** had the highest MSE, suggesting that it may not be the best choice for this dataset.
- The **Decision Tree** model fell in between, with moderate performnce.
ance.ance.