# Aspirational Models Experiment 1.2

In [97]:
import pandas as pd
import numpy as np
import random
import warnings
import spacy
import nltk
from scipy.sparse import hstack, csr_matrix
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
import xgboost as xgb
from sklearn.metrics import precision_score, recall_score, f1_score, confusion_matrix, classification_report
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from nltk.tokenize import word_tokenize
from gensim.models import Word2Vec

import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.utils import shuffle

# Set random seed
random.seed(18)
seed = 18

# Ignore warnings
warnings.filterwarnings('ignore')


In [31]:
# Initialize spaCy
nlp = spacy.load('en_core_web_sm')

# Download necessary NLTK data
nltk.download('punkt')

[nltk_data] Downloading package punkt to
[nltk_data]     /Users/gbaldonado/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

## 1. Loading the data and quick exploratory data analysis

In [21]:
merged_aspirational_df = pd.read_csv("/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/data/processed_for_model/merged_themes_using_jaccard_method/merged_Aspirational_sentence_level_batch_1_jaccard.csv", encoding='utf-8')

# Shuffle the merged dataset
merged_aspirational_df = shuffle(merged_aspirational_df, random_state=seed)

# # Train-test split 
# training_df, test_df = train_test_split(merged_aspirational_df, test_size=0.1, random_state=18, stratify=merged_aspirational_df['label'])

# training_df.reset_index(drop=True, inplace=True)
# test_df.reset_index(drop=True, inplace=True)

## 3. Feature Engineering

In [13]:
# Keywords for attainment and aspiration
attainment_keywords = ['degree', 'grades', 'career', 'job', 'medical school', 'achieve', 'CV', 'resume']
aspiration_keywords = ['goal', 'dream', 'hope', 'aspire', 'motivated', 'ambition', 'understanding', 'better person']

def keyword_features(text):
    features = {}
    for keyword in attainment_keywords:
        features[f'attainment_{keyword}'] = int(keyword in text.lower())
    for keyword in aspiration_keywords:
        features[f'aspiration_{keyword}'] = int(keyword in text.lower())
    return features

In [32]:
def dependency_features(doc):
    features = {}
    for token in doc:
        if token.dep_ in ['nsubj', 'dobj', 'iobj', 'amod']:
            features[f'deprel_{token.dep_}_{token.text}'] = 1
    return features

In [33]:
def pos_features(doc):
    features = {}
    for token in doc:
        features[f'pos_{token.pos_}_{token.text}'] = 1
    return features


In [66]:
from sklearn.feature_extraction.text import CountVectorizer

# N-gram vectorizer function
def ngram_features(corpus, ngram_range=(1, 2)):
    vectorizer = CountVectorizer(ngram_range=ngram_range, tokenizer=word_tokenize)
    X = vectorizer.fit_transform(corpus)
    feature_names = [str(i + 1) for i in range(X.shape[1])]  # Naming features as '1', '2', '3', ...
    return X, feature_names


In [40]:
# Train Word2Vec model on your corpus
def get_word2vec(corpus):
    tokenized_corpus = [word_tokenize(sentence) for sentence in corpus]
    model = Word2Vec(sentences=tokenized_corpus, vector_size=100, window=5, min_count=1, workers=4)
    return model

# Get the average word vector for a sentence
def sentence_vector(sentence, model):
    words = word_tokenize(sentence)
    word_vectors = [model.wv[word] for word in words if word in model.wv]
    if len(word_vectors) == 0:
        return np.zeros(model.vector_size)
    return np.mean(word_vectors, axis=0)


In [98]:
# Combine all feature extraction functions
def extract_features(df, word2vec_model, ngram_range=(1, 2)):
    keyword_feats = []
    dependency_feats = []
    pos_feats = []
    sent_vectors = []
    
    for text in df['sentence']:
        doc = nlp(text)
        
        # Keyword features
        kw_features = keyword_features(text)
        keyword_feats.append(kw_features)
        
        # Dependency features
        dep_features = dependency_features(doc)
        dependency_feats.append(dep_features)
        
        # POS features
        pos_features_dict = pos_features(doc)
        pos_feats.append(pos_features_dict)
        
        # Sentence vector (semantic similarity)
        sent_vector = sentence_vector(text, word2vec_model)
        sent_vectors.append(sent_vector)
    
    # Convert to DataFrame
    keyword_df = pd.DataFrame(keyword_feats).fillna(0)
    dependency_df = pd.DataFrame(dependency_feats).fillna(0)
    pos_df = pd.DataFrame(pos_feats).fillna(0)
    sent_vectors_df = pd.DataFrame(sent_vectors)
    
    # N-gram features
    ngram_X, ngram_feature_names = ngram_features(df['sentence'], ngram_range)
    
    # Combine all features
    keyword_df_sparse = csr_matrix(keyword_df)
    dependency_df_sparse = csr_matrix(dependency_df)
    pos_df_sparse = csr_matrix(pos_df)
    sent_vectors_df_sparse = csr_matrix(sent_vectors_df)
    
    X_combined = hstack([keyword_df_sparse, dependency_df_sparse, pos_df_sparse, sent_vectors_df_sparse, ngram_X])
    X_combined_df = pd.DataFrame.sparse.from_spmatrix(X_combined)
    X_combined_df.columns = list(keyword_df.columns) + list(dependency_df.columns) + list(pos_df.columns) + list(sent_vectors_df.columns) + ngram_feature_names
    
    return X_combined_df

In [99]:
corpus = merged_aspirational_df['sentence'].tolist()
word2vec_model = get_word2vec(corpus)

In [100]:
# Extract features
feature_df = extract_features(merged_aspirational_df, word2vec_model)

In [102]:
feature_df.columns

Index([        'attainment_degree',         'attainment_grades',
               'attainment_career',            'attainment_job',
       'attainment_medical school',        'attainment_achieve',
                   'attainment_CV',         'attainment_resume',
                 'aspiration_goal',          'aspiration_dream',
       ...
                           '30298',                     '30299',
                           '30300',                     '30301',
                           '30302',                     '30303',
                           '30304',                     '30305',
                           '30306',                     '30307'],
      dtype='object', length=36900)

In [103]:
labels = merged_aspirational_df["label"]

# Combine extracted features with the label
final_df = pd.concat([feature_df, pd.Series(labels, name='label')], axis=1)
final_df

Unnamed: 0,attainment_degree,attainment_grades,attainment_career,attainment_job,attainment_medical school,attainment_achieve,attainment_CV,attainment_resume,aspiration_goal,aspiration_dream,...,30299,30300,30301,30302,30303,30304,30305,30306,30307,label
0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
1,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
2,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
3,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1
4,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
4699,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4700,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4701,1.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0
4702,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0


In [104]:
final_df.columns = final_df.columns.astype(str)

In [105]:
# Splitting the data
X = final_df.drop('label', axis=1)
y = final_df['label']

In [138]:
# Split the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1, random_state=seed)

In [139]:
print(X_train.shape)

(4233, 36900)


In [140]:
print(X_test.shape)

(471, 36900)


In [141]:
# Define hyperparameters for each model
log_reg_params = {
    'classifier__C': [0.01, 0.1, 1, 10, 100],
    'classifier__penalty' : ["l2"],
    'classifier__solver': ['liblinear']
}

rf_params = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [5, 10, 15],
    'classifier__min_samples_split': [2, 10],
    'classifier__min_samples_leaf': [1, 5]
}

xgb_params = {
    'classifier__n_estimators': [50, 100, 200],
    'classifier__max_depth': [3, 5, 7],
    'classifier__learning_rate': [0.01, 0.1, 0.3]
}



In [142]:
# Define the classifiers
log_reg = Pipeline([
    ('classifier', LogisticRegression(class_weight='balanced', max_iter=10000, random_state=seed))
])

rf = Pipeline([
    ('classifier', RandomForestClassifier(class_weight='balanced', random_state=seed))
])

xgboost = Pipeline([
    ('classifier', xgb.XGBClassifier(class_weight='balanced', random_state=seed, use_label_encoder=False, eval_metric='logloss'))
])


In [143]:
# Perform grid search with cross-validation
def perform_grid_search(pipeline, params, X_train, y_train):
    grid_search = GridSearchCV(pipeline, param_grid=params, cv=10, n_jobs=-1, verbose=1, scoring='f1_macro')
    grid_search.fit(X_train, y_train)
    return grid_search

# Evaluate the model
def evaluate_model(model, X_test, y_test):
    y_pred = model.predict(X_test)
    precision = precision_score(y_test, y_pred, average='macro')
    recall = recall_score(y_test, y_pred, average='macro')
    f1 = f1_score(y_test, y_pred, average='macro')
    
    # Confusion matrix
    cm = confusion_matrix(y_test, y_pred)
    tn, fp, fn, tp = cm.ravel()
    print(cm)
    print(f"True Positives: {tp}, False Positives: {fp}, True Negatives: {tn}, False Negatives: {fn}")
    # Classification report
    class_report = classification_report(y_test, y_pred)
    print(class_report)
    
    return precision, recall, f1, cm, class_report


In [144]:
# Store results
results = []

In [145]:
# Logistic Regression
log_reg_grid = perform_grid_search(log_reg, log_reg_params, X_train, y_train)
log_reg_precision, log_reg_recall, log_reg_f1, log_cm, log_class_report = evaluate_model(log_reg_grid, X_test, y_test)
results.append({
    'Algorithm': 'Logistic Regression',
    'Avg Precision': log_reg_precision,
    'Avg Recall': log_reg_recall,
    'Macro F1 Score': log_reg_f1,
    'Confusion Matrix': log_cm,
    'classification_report': log_class_report,
})


Fitting 10 folds for each of 5 candidates, totalling 50 fits
[[366  46]
 [ 56   3]]
True Positives: 3, False Positives: 46, True Negatives: 366, False Negatives: 56
              precision    recall  f1-score   support

           0       0.87      0.89      0.88       412
           1       0.06      0.05      0.06        59

    accuracy                           0.78       471
   macro avg       0.46      0.47      0.47       471
weighted avg       0.77      0.78      0.77       471



In [146]:
# Random Forest
rf_grid = perform_grid_search(rf, rf_params, X_train, y_train)
rf_precision, rf_recall, rf_f1 = evaluate_model(rf_grid, X_test, y_test)
results.append({
    'Algorithm': 'Random Forest',
    'Avg Precision': rf_precision,
    'Avg Recall': rf_recall,
    'Macro F1 Score': rf_f1
})

Fitting 10 folds for each of 36 candidates, totalling 360 fits
[[374  38]
 [ 56   3]]
True Positives: 3, False Positives: 38, True Negatives: 374, False Negatives: 56
              precision    recall  f1-score   support

           0       0.87      0.91      0.89       412
           1       0.07      0.05      0.06        59

    accuracy                           0.80       471
   macro avg       0.47      0.48      0.47       471
weighted avg       0.77      0.80      0.78       471



ValueError: too many values to unpack (expected 3)

In [None]:
# XGBoost
xgb_grid = perform_grid_search(xgboost, xgb_params, X_train, y_train)
xgb_precision, xgb_recall, xgb_f1 = evaluate_model(xgb_grid, X_test, y_test)
results.append({
    'Algorithm': 'XGBoost',
    'Avg Precision': xgb_precision,
    'Avg Recall': xgb_recall,
    'Macro F1 Score': xgb_f1
})

In [None]:
# Save results to DataFrame
results_df = pd.DataFrame(results)
print(results_df)