# Feature Engineering for Exp 2.0 Batch 1 + Batch 2

In [1]:
import pandas as pd
import numpy as np
import random
import re
import matplotlib.pyplot as plt
import plotly.express as px
import seaborn as sns
import csv
import pickle
import warnings
import stanza

from random import shuffle
from nltk import word_tokenize,pos_tag
from nltk.corpus import stopwords, wordnet
from nltk.stem import WordNetLemmatizer
from textblob import TextBlob
from collections import Counter
from gensim.models import Word2Vec

from sklearn.model_selection import train_test_split, GridSearchCV, learning_curve
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.metrics import confusion_matrix, classification_report, roc_auc_score, f1_score, r2_score, make_scorer
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.utils import shuffle

from imblearn.over_sampling import RandomOverSampler
from imblearn.under_sampling import RandomUnderSampler

# Set random seed
random.seed(18)
seed = 18

# Ignore warnings
warnings.filterwarnings('ignore')

# Display options
pd.set_option('display.max_colwidth', None)

# Initialize lemmatizer, stop words, and stanza
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))
stanza.download('en') # download English model
nlp = stanza.Pipeline('en') # initialize English neural pipeline


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-10-24 21:47:17 INFO: Downloaded file to /Users/gbaldonado/stanza_resources/resources.json
2024-10-24 21:47:17 INFO: Downloading default packages for language: en (English) ...
2024-10-24 21:47:18 INFO: File exists: /Users/gbaldonado/stanza_resources/en/default.zip
2024-10-24 21:47:21 INFO: Finished downloading models and saved to /Users/gbaldonado/stanza_resources
2024-10-24 21:47:21 INFO: Checking for updates to resources.json in case models have been updated.  Note: this behavior can be turned off with download_method=None or download_method=DownloadMethod.REUSE_RESOURCES


Downloading https://raw.githubusercontent.com/stanfordnlp/stanza-resources/main/resources_1.8.0.json:   0%|   …

2024-10-24 21:47:21 INFO: Downloaded file to /Users/gbaldonado/stanza_resources/resources.json
2024-10-24 21:47:22 INFO: Loading these models for language: en (English):
| Processor    | Package                   |
--------------------------------------------
| tokenize     | combined                  |
| mwt          | combined                  |
| pos          | combined_charlm           |
| lemma        | combined_nocharlm         |
| constituency | ptb3-revised_charlm       |
| depparse     | combined_charlm           |
| sentiment    | sstplus_charlm            |
| ner          | ontonotes-ww-multi_charlm |

2024-10-24 21:47:22 INFO: Using device: cpu
2024-10-24 21:47:22 INFO: Loading: tokenize
2024-10-24 21:47:22 INFO: Loading: mwt
2024-10-24 21:47:22 INFO: Loading: pos
2024-10-24 21:47:22 INFO: Loading: lemma
2024-10-24 21:47:23 INFO: Loading: constituency
2024-10-24 21:47:23 INFO: Loading: depparse
2024-10-24 21:47:23 INFO: Loading: sentiment
2024-10-24 21:47:23 INFO: Loading: 

## 1. Loading the data and quick exploratory data analysis

In [2]:
merged_aspirational_df_batch_1 = pd.read_csv("/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/data/processed_for_model/merged_themes_using_jaccard_method/merged_Aspirational_sentence_level_batch_1_jaccard.csv", encoding='utf-8')
merged_aspirational_df_batch_2 = pd.read_csv("/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/data/processed_for_model/merged_themes_using_jaccard_method/Aspirational Plus_sentence_level_batch_2_jaccard.csv", encoding='utf-8')
merged_aspirational_df = pd.concat([merged_aspirational_df_batch_1, merged_aspirational_df_batch_2])

merged_familial_df_batch_1 = pd.read_csv("/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/data/processed_for_model/merged_themes_using_jaccard_method/merged_Familial_sentence_level_batch_1_jaccard.csv", encoding='utf-8')
merged_familial_df_batch_2 = pd.read_csv("/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/data/processed_for_model/merged_themes_using_jaccard_method/Familial Plus_sentence_level_batch_2_jaccard.csv", encoding='utf-8')
merged_familial_df = pd.concat([merged_familial_df_batch_1, merged_familial_df_batch_2])

merged_navigational_df_batch_1 = pd.read_csv("/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/data/processed_for_model/merged_themes_using_jaccard_method/merged_Navigational_sentence_level_batch_1_jaccard.csv", encoding='utf-8')
merged_navigational_df_batch_2 = pd.read_csv("/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/data/processed_for_model/merged_themes_using_jaccard_method/Navigational Plus_sentence_level_batch_2_jaccard.csv", encoding='utf-8')
merged_navigational_df = pd.concat([merged_navigational_df_batch_1, merged_navigational_df_batch_2])

merged_resistance_df_batch_1 = pd.read_csv("/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/data/processed_for_model/merged_themes_using_jaccard_method/merged_Resistance_sentence_level_batch_1_jaccard.csv", encoding='utf-8')
merged_resistance_df_batch_2 = pd.read_csv("/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/data/processed_for_model/merged_themes_using_jaccard_method/Resistance Plus_sentence_level_batch_2_jaccard.csv", encoding='utf-8')
merged_resistance_df = pd.concat([merged_resistance_df_batch_1, merged_resistance_df_batch_2])

merged_social_df_batch_1 = pd.read_csv("/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/data/processed_for_model/merged_themes_using_jaccard_method/merged_Social_sentence_level_batch_1_jaccard.csv", encoding='utf-8')
merged_social_df_batch_2 = pd.read_csv("/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/data/processed_for_model/merged_themes_using_jaccard_method/Social Plus_sentence_level_batch_2_jaccard.csv", encoding='utf-8')
merged_social_df = pd.concat([merged_social_df_batch_1, merged_social_df_batch_2])

merged_dataset_dict = {"Aspirational": merged_aspirational_df,
                  "Familial": merged_familial_df,
                  "Navigational": merged_navigational_df,
                  "Resistance": merged_resistance_df,
                  "Social": merged_social_df
                  }


def prepare_data(df, test_size=0.1, resample=True, strategy='undersample', seed=18):
    """
    Prepare training and testing datasets by shuffling, splitting, and resampling if necessary.

    Parameters:
    - df: The input DataFrame containing the merged dataset.
    - test_size: Proportion of the dataset to include in the test split.
    - resample: Boolean indicating whether to apply resampling.
    - strategy: 'oversample' or 'undersample' if resampling is enabled.
    - seed: Random seed for reproducibility.

    Returns:
    - training_df: DataFrame containing the training set.
    - test_df: DataFrame containing the test set.
    """

    # Shuffle the merged dataset
    df = shuffle(df, random_state=seed)

    # Train-test split
    training_df, test_df = train_test_split(
        df,
        test_size=test_size,
        random_state=seed,
        stratify=df['label']
    )

    # Function for undersampling or oversampling with a target ratio for oversampling
    def resample_data(X, y, strategy='oversample', random_state=None, minority_ratio=0.3):
        """
        Resample data using oversampling or undersampling.

        Parameters:
        X: Features
        y: Labels
        strategy: 'oversample' or 'undersample'
        random_state: Seed for random state
        minority_ratio: Target ratio of minority class after oversampling (0.2 to 0.3 recommended)

        Returns:
        X_resampled: Resampled features
        y_resampled: Resampled labels
        """
        # Ensure minority_ratio is within the desired range
        if not (0.2 <= minority_ratio <= 0.3):
            raise ValueError("Minority ratio must be between 0.2 and 0.3")

        # Calculate the desired sampling strategy
        # For oversampling, we calculate how many samples we want for the minority class
        if strategy == 'oversample':
            # Minority class will be resampled to be `minority_ratio` of the total
            # i.e., minority / (minority + majority) = minority_ratio
            majority_class_count = sum(y == max(set(y), key=list(y).count))  # Majority class count
            desired_minority_count = int((minority_ratio / (1 - minority_ratio)) * majority_class_count)
            sampling_strategy = {min(set(y), key=list(y).count): desired_minority_count}
            sampler = RandomOverSampler(sampling_strategy=sampling_strategy, random_state=random_state)

        elif strategy == 'undersample':
            minority_class_count = sum(y == min(set(y), key=list(y).count))  # Minority class count
            desired_majority_count = int(((1 - minority_ratio) / minority_ratio) * minority_class_count)
            sampling_strategy = {max(set(y), key=list(y).count): desired_majority_count}
            sampler = RandomUnderSampler(sampling_strategy=sampling_strategy, random_state=random_state)

        else:
            raise ValueError("Strategy must be 'oversample' or 'undersample'")

        X_resampled, y_resampled = sampler.fit_resample(X, y)
        return X_resampled, y_resampled

    # Separate features and labels
    X = training_df.drop(columns=['label'])  # Replace 'label' with your target column name
    y = training_df['label']

    # Toggle resampling
    if resample:
        # Apply resampling (choose 'oversample' or 'undersample')
        X_resampled, y_resampled = resample_data(X, y, strategy=strategy, random_state=seed)

        # Combine resampled data into a single DataFrame
        training_df = pd.concat([pd.DataFrame(X_resampled, columns=X.columns), pd.Series(y_resampled, name='label')], axis=1)

    # Reset the index of the DataFrames
    training_df.reset_index(drop=True, inplace=True)
    test_df.reset_index(drop=True, inplace=True)

    print("Training and test sets loaded.")

    print(f"Training dataset shape: {training_df.shape} \nTest dataset shape: {test_df.shape}")
    pos_labels_train = len(training_df[training_df['label'] == 1])
    print(f"Positive labels present in the training dataset: {pos_labels_train} out of {len(training_df)} or {pos_labels_train/len(training_df)*100:.2f}%")
    pos_labels_test = len(test_df[test_df['label'] == 1])
    print(f"Positive labels present in the test dataset: {pos_labels_test} out of {len(test_df)} or {pos_labels_test/len(test_df)*100:.2f}%")

    return training_df, test_df

In [3]:
# Create a dummy subset function that ensures at least 2 samples from each class
def create_balanced_dummy_subset(df, label_column='label', n_majority=20, n_minority=8):
    """Create a subset ensuring at least 2 samples from the minority class."""
    
    # Split the dataset by class
    majority_class = df[df[label_column] == 0]
    minority_class = df[df[label_column] == 1]
    
    # Check the sizes of the classes to avoid sampling more than available
    n_minority = min(n_minority, len(minority_class))
    n_majority = min(n_majority, len(majority_class))
    
    # Sample from both classes
    minority_sample = minority_class.sample(n=n_minority, random_state=42)
    majority_sample = majority_class.sample(n=n_majority, random_state=42)
    
    # Combine the samples and shuffle the dataset
    balanced_subset = pd.concat([majority_sample, minority_sample]).sample(frac=1, random_state=42)
    
    return balanced_subset

# Initialize a dictionary to hold the dummy subsets
dummy_dataset_dict = {}

# Loop through each theme and create a balanced dummy subset
for theme_name, df in merged_dataset_dict.items():
    dummy_dataset_dict[theme_name] = create_balanced_dummy_subset(df, label_column='label', n_majority=20, n_minority=8)
    print(f"Dummy subset for {theme_name} created with {dummy_dataset_dict[theme_name].shape[0]} rows.")

# Optionally display the first few rows of one of the dummy datasets to verify
print(dummy_dataset_dict["Aspirational"].head())


Dummy subset for Aspirational created with 28 rows.
Dummy subset for Familial created with 28 rows.
Dummy subset for Navigational created with 28 rows.
Dummy subset for Resistance created with 28 rows.
Dummy subset for Social created with 28 rows.
                                                                                                                                                                                                                                      sentence  \
2515                                                                                                                                                                                by that i mean i want to educate myself more biochemistry.   
2306  i decided when i was younger that i wanted to be able to live a comfortable lifestyle when i am older and so in order to do that i have to get a job that i not only love, but also will be able to support myself and my future family.   
4238                      

## 2. Feature Engineering

### 1. NER

In [4]:
def get_ner(text):
    ner_list = []
    # Annotate the text using stanza
    doc = nlp(text)

    for sentence in doc.sentences:
        for entity in sentence.ents:
            if entity.type == 'PERSON':
                ner_list.append(entity.text)

    return ner_list

# Example usage
text = "Barack Obama was the 44th doctor of the United States."
print(get_ner(text))

['Barack Obama']


In [5]:
# check if a named entity is present in the sentence
def named_entity_present(sentence):
    ner_list = get_ner(sentence)
    if len(ner_list) > 0:
        return 1
    else:
        return 0

### 2. Similarity Features

In [6]:
# A helper function to get the similar words and similarity score
# The function takes tokens of sentence as input and if its not a stop word, get its similarity with synsets of STEM.
stop_words = set(stopwords.words('english'))
stop_words |= set(["help","try", "work", "process", "support", "job"] )
def word_similarity(tokens, syns, field):    
    if field in ['engineering', 'technology']:
        score_threshold = 0.5
    else:
        score_threshold = 0.2
    sim_words = 0
    for token in tokens:
        if token not in stop_words:
            try:
                syns_word = wordnet.synsets(token) 
                score = syns_word[0].path_similarity(syns[0])
                if score >= score_threshold:
                    sim_words += 1
            except: 
                score = 0
    
    return sim_words

In [7]:
# Functions to create columns for similarity based on all STEM fields
syns_bio = wordnet.synsets(lemmatizer.lemmatize("biology"))
syns_maths = wordnet.synsets(lemmatizer.lemmatize("mathematics")) 
syns_tech = wordnet.synsets(lemmatizer.lemmatize("technology"))
syns_eng = wordnet.synsets(lemmatizer.lemmatize("engineering"))
syns_chem = wordnet.synsets(lemmatizer.lemmatize("chemistry"))
syns_phy = wordnet.synsets(lemmatizer.lemmatize("physics"))
syns_sci = wordnet.synsets(lemmatizer.lemmatize("science"))

### 3. Medical Word Count

In [8]:
# Load the medical specialization text file and create a list
medical_list = []
with open('/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/data/features/medical_specialities.txt', 'r') as medical_fields:
    for line in medical_fields.readlines():
        special_field = line.rstrip('\n')
        special_field = re.sub("\W"," ", special_field )
#         print(special_field)
        medical_list += special_field.split()
medical_list = list(set(medical_list))  
medical_list = [x.lower() for x in medical_list]
print(medical_list)

['neuroradiology', 'occupational', 'hospice', 'gastroenterology', 'rheumatology', 'dermatology', 'banking', 'urology', 'forensic', 'retardation', 'calculi', 'neuropathology', 'hematology', 'neurology', 'cornea', 'sleep', 'endocrinology', 'geriatric', 'ophthalmology', 'male', 'psychiatry', 'oncology', 'surgery', 'internal', 'critical', 'microbiology', 'interventional', 'neurophysiology', 'public', 'anterior', 'internal', 'neonatal', 'anesthesiology', 'endovascular', 'uveitis', 'immunopathology', 'heart', 'oculoplastics', 'infectious', 'rheumatology', 'pediatric', 'liaison', 'reproductive', 'glaucoma', 'procedural', 'genetics', 'maternal', 'plastic', 'advanced', 'cardiac', 'neurodevelopmental', 'endocrinology', 'fetal', 'cardiology', 'sports', 'consultation', 'psychosomatic', 'neuromuscular', 'infertility', 'cytopathology', 'physical', 'disabilities', 'imaging', 'musculoskeletal', 'diabetes', 'child', 'ophthalmology', 'care', 'pathology', 'preventive', 'gastroenterology', 'orbit', 'elect

In [9]:
# A helper function to get medical words
def check_medical_words(tokens):
    for token in tokens:
        if token not in stop_words and token in [x.lower() for x in medical_list]:
            return 1
        
    return 0

### 4. Sentiment Polarity and Subjectivity

In [10]:
# A helper function to get polarity and subjectivity of the sentence using TexBlob
def get_sentiment(sentence):
    sentiments =TextBlob(sentence).sentiment
    polarity = sentiments.polarity
    subjectivity = sentiments.subjectivity
    return polarity, subjectivity

### 5. POS Tag Count

In [11]:
# A helper function to get the count of POS tags of the sentence
def count_pos_tags(tokens):
    token_pos = pos_tag(tokens)
    count = Counter(tag for word,tag in token_pos)
    interjections =  count['UH']
    nouns = count['NN'] + count['NNS'] + count['NNP'] + count['NNPS']
    adverb = count['RB'] + count['RBS'] + count['RBR']
    verb = count['VB'] + count['VBD'] + count['VBG'] + count['VBN']
    determiner = count['DT']
    pronoun = count['PRP']
    adjetive = count['JJ'] + count['JJR'] + count['JJS']
    preposition = count['IN']
    return interjections, nouns, adverb, verb, determiner, pronoun, adjetive,preposition

In [12]:
def pos_tag_extraction(dataframe, field, func, column_names):
    return pd.concat((
        dataframe,
        dataframe[field].apply(
            lambda cell: pd.Series(func(cell), index=column_names))), axis=1)

### 6. Word Embeddings

In [13]:
# Load the w2v dict from pickle file
with open('/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/data/features/pickle/embeddings06122024.pickle', 'rb') as w2v_file:
    w2v_dict = pickle.load(w2v_file)

In [14]:
print("length of word embeddings: ", len(w2v_dict.keys()))

length of word embeddings:  4762


In [15]:
# Get the vectors for the essay
def vectorizer(sequence):
    vect = []
    numw = 0
    for w in sequence: 
        try :
            if numw == 0:
                vect = w2v_dict[w]
            else:
                vect = np.add(vect, w2v_dict[w])
            numw += 1
        except Exception as e:
            pass

    return vect/ numw 

In [16]:
# Function to split text into words
def split_into_words(text):
    return text.split()

### 7. Unigrams

In [17]:
# Define the vectorizer
unigram_vect = CountVectorizer(ngram_range=(1, 1), min_df=2, stop_words = 'english')

In [18]:
def append_unigram(X_TRAIN, X_TEST):
    # Fit the unigram vectorizer on the training set
    unigram_matrix = unigram_vect.fit_transform(X_TRAIN['sentence'])
    
    # Convert unigram matrix to DataFrame with proper column names
    unigrams = pd.DataFrame(unigram_matrix.toarray(), columns=unigram_vect.get_feature_names_out())
    unigrams = unigrams.reset_index(drop=True)
    
    # Append unigrams to the training data
    X_train = pd.concat([X_TRAIN.reset_index(drop=True), unigrams], axis=1)
    print("Shape of the unigram df for train: ", unigrams.shape)
    
    # Transform the test set with the fitted vectorizer
    unigram_matrix_test = unigram_vect.transform(X_TEST['sentence'])
    
    # Convert test unigram matrix to DataFrame with proper column names
    unigrams_test = pd.DataFrame(unigram_matrix_test.toarray(), columns=unigram_vect.get_feature_names_out())
    unigrams_test = unigrams_test.reset_index(drop=True)
    
    # Append unigrams to the test data
    X_test = pd.concat([X_TEST.reset_index(drop=True), unigrams_test], axis=1)
    print("Test unigram df shape: ", unigrams_test.shape)
    
    return X_train, X_test

### 8. Putting them all together

In [19]:
from nltk.tokenize import word_tokenize

# Optimized wrapper function for feature engineering using list comprehensions
def feature_engineering(original_dataset):

    dataset = original_dataset.copy()

    # 1. Tokenization: Use list comprehension instead of apply
    dataset['tokens'] = [word_tokenize(sentence) for sentence in dataset['sentence']]

    # 2. Similarity features: Process all similarity features in a single loop using list comprehension
    similarity_columns = ['bio_sim_words', 'chem_sim_words', 'phy_sim_words', 'math_sim_words', 'tech_sim_words', 'eng_sim_words']
    sim_functions = [(syns_bio, 'biology'), (syns_chem, 'chemistry'), (syns_phy, 'physics'), (syns_maths, 'mathematics'),
                     (syns_tech, 'technology'), (syns_eng, 'engineering')]

    # Create similarity feature columns using list comprehensions
    for col_name, (syns, label) in zip(similarity_columns, sim_functions):
        dataset[col_name] = [word_similarity(tokens, syns, label) for tokens in dataset['tokens']]

    # 3. Medical terms: Use list comprehension to check medical terms
    dataset['medical_terms'] = [check_medical_words(tokens) for tokens in dataset['tokens']]

    # 4. Polarity and subjectivity: Use list comprehension for sentiment analysis
    sentiment_results = [get_sentiment(sentence) for sentence in dataset['sentence']]
    dataset['polarity'] = [result[0] for result in sentiment_results]
    dataset['subjectivity'] = [result[1] for result in sentiment_results]

    # 5. Named entity recognition: Use list comprehension for NER
    dataset['ner'] = [named_entity_present(sentence) for sentence in dataset['sentence']]

    # 6. POS tag count: Assuming pos_tag_extraction cannot be vectorized, use it as-is
    dataset = pos_tag_extraction(dataset, 'tokens', count_pos_tags, 
                                 ['interjections', 'nouns', 'adverb', 'verb', 'determiner', 'pronoun', 'adjective', 'preposition'])

    # 7. Labels and Features (X and y split)
    data_labels = dataset['label']
    data_x = dataset.drop(columns='label')

    # 8. Vectorization of tokens using list comprehension for efficient vectorization
    vect_arr = np.array([vectorizer(tokens) for tokens in data_x['tokens']])

    # 9. Create embedding DataFrame using the vectorized embeddings
    embedding_df = pd.DataFrame(vect_arr.tolist(), index=data_x.index)
    embedding_df.columns = [f'embedding_{i}' for i in range(embedding_df.shape[1])]

    # 10. Concatenate embedding features with the original data
    data_x = pd.concat([data_x.reset_index(drop=True), embedding_df], axis=1)

    return data_x, data_labels

In [21]:
import os 

# Initialize an empty dictionary to hold training and test DataFrames for each theme
theme_dfs = {}

# Define base directory where subfolders for each theme will be saved
base_directory = 'theme_features'

# List of themes to ensure subfolders exist for each one
theme_list = ['aspirational', 'familial', 'navigational', 'resistance', 'social']

# Ensure that the base directory exists, create if it doesn't
if not os.path.exists(base_directory):
    os.makedirs(base_directory)

# Loop through each theme and its corresponding merged dataset
for theme_name, merged_df in merged_dataset_dict.items():
    print(f"Processing theme: {theme_name}")

    # Ensure subfolder exists for the current theme
    theme_folder = os.path.join(base_directory, theme_name)
    if not os.path.exists(theme_folder):
        os.makedirs(theme_folder)

    # Call the prepare_data function and get the training and test DataFrames
    training_df, test_df = prepare_data(merged_df, test_size=0.1, resample=False, strategy='oversample', seed=18)

    # Print dataset shape information
    print(f"Training dataset shape: {training_df.shape} \nTest dataset shape: {test_df.shape}")

    # Count and print the number of positive labels in training and test sets
    pos_labels_train = training_df['label'].sum()
    print(f"Positive labels in training set: {pos_labels_train} out of {len(training_df)} ({(pos_labels_train / len(training_df)) * 100:.2f}%)")

    pos_labels_test = test_df['label'].sum()
    print(f"Positive labels in test set: {pos_labels_test} out of {len(test_df)} ({(pos_labels_test / len(test_df)) * 100:.2f}%)")

    # Feature engineering
    print(" - Feature engineering...")
    X_train, y_train = feature_engineering(training_df)
    X_test, y_test = feature_engineering(test_df)

    # Ensure that the labels are integers
    y_train = y_train.astype('int')
    y_test = y_test.astype('int')


    # Append unigrams (or perform other feature modifications)
    X_train_final, X_test_final = append_unigram(X_train, X_test)

    # Store the X and y train and test DataFrames in the dictionary as a tuple
    theme_dfs[theme_name] = (X_train_final, y_train, X_test_final, y_test)

    # Save each DataFrame as CSV in the respective theme subfolder
    X_train_file = os.path.join(theme_folder, 'X_train.csv')
    y_train_file = os.path.join(theme_folder, 'y_train.csv')
    X_test_file = os.path.join(theme_folder, 'X_test.csv')
    y_test_file = os.path.join(theme_folder, 'y_test.csv')

    X_train_final.to_csv(X_train_file, index=False)
    y_train.to_csv(y_train_file, index=False)
    X_test_final.to_csv(X_test_file, index=False)
    y_test.to_csv(y_test_file, index=False)

    print(f" - Saved feature datasets for theme '{theme_name}' to folder '{theme_folder}'.")

# Display the resulting dictionary
print("X and y train and test DataFrames stored for each theme:")
for theme, dfs in theme_dfs.items():
    print(f"{theme}: X_train shape: {dfs[0].shape}, y_train shape: {dfs[1].shape}, X_test shape: {dfs[2].shape}, y_test shape: {dfs[3].shape}")

Processing theme: Aspirational
Training and test sets loaded.
Training dataset shape: (8856, 3) 
Test dataset shape: (985, 3)
Positive labels present in the training dataset: 805 out of 8856 or 9.09%
Positive labels present in the test dataset: 89 out of 985 or 9.04%
Training dataset shape: (8856, 3) 
Test dataset shape: (985, 3)
Positive labels in training set: 805 out of 8856 (9.09%)
Positive labels in test set: 89 out of 985 (9.04%)
 - Feature engineering...


TypeError: unsupported operand type(s) for /: 'list' and 'int'

In [None]:
import os 

# Initialize an empty dictionary to hold training and test DataFrames for each theme
oversample_theme_dfs = {}

# Define base directory where subfolders for each theme will be saved
base_directory = 'oversample_theme_features'

# List of themes to ensure subfolders exist for each one
theme_list = ['aspirational', 'familial', 'navigational', 'resistance', 'social']

# Ensure that the base directory exists, create if it doesn't
if not os.path.exists(base_directory):
    os.makedirs(base_directory)

# Loop through each theme and its corresponding merged dataset
for theme_name, merged_df in merged_dataset_dict.items():
    print(f"Processing theme: {theme_name}")

    # Ensure subfolder exists for the current theme
    theme_folder = os.path.join(base_directory, theme_name)
    if not os.path.exists(theme_folder):
        os.makedirs(theme_folder)

    # Call the prepare_data function and get the training and test DataFrames
    training_df, test_df = prepare_data(merged_df, test_size=0.1, resample=True, strategy='oversample', seed=18)

    # Print dataset shape information
    print(f"Training dataset shape: {training_df.shape} \nTest dataset shape: {test_df.shape}")

    # Count and print the number of positive labels in training and test sets
    pos_labels_train = training_df['label'].sum()
    print(f"Positive labels in training set: {pos_labels_train} out of {len(training_df)} ({(pos_labels_train / len(training_df)) * 100:.2f}%)")

    pos_labels_test = test_df['label'].sum()
    print(f"Positive labels in test set: {pos_labels_test} out of {len(test_df)} ({(pos_labels_test / len(test_df)) * 100:.2f}%)")

    # Feature engineering
    print(" - Feature engineering...")
    X_train, y_train = feature_engineering(training_df)
    X_test, y_test = feature_engineering(test_df)

    # Ensure that the labels are integers
    y_train = y_train.astype('int')
    y_test = y_test.astype('int')


    # Append unigrams (or perform other feature modifications)
    X_train_final, X_test_final = append_unigram(X_train, X_test)

    # Store the X and y train and test DataFrames in the dictionary as a tuple
    oversample_theme_dfs[theme_name] = (X_train_final, y_train, X_test_final, y_test)

    # Save each DataFrame as CSV in the respective theme subfolder
    X_train_file = os.path.join(theme_folder, 'X_train.csv')
    y_train_file = os.path.join(theme_folder, 'y_train.csv')
    X_test_file = os.path.join(theme_folder, 'X_test.csv')
    y_test_file = os.path.join(theme_folder, 'y_test.csv')

    X_train_final.to_csv(X_train_file, index=False)
    y_train.to_csv(y_train_file, index=False)
    X_test_final.to_csv(X_test_file, index=False)
    y_test.to_csv(y_test_file, index=False)

    print(f" - Saved feature datasets for theme '{theme_name}' to folder '{theme_folder}'.")

# Display the resulting dictionary
print("X and y train and test DataFrames stored for each theme:")
for theme, dfs in oversample_theme_dfs.items():
    print(f"{theme}: X_train shape: {dfs[0].shape}, y_train shape: {dfs[1].shape}, X_test shape: {dfs[2].shape}, y_test shape: {dfs[3].shape}")

In [None]:
import os 

# Initialize an empty dictionary to hold training and test DataFrames for each theme
undersample_theme_dfs = {}

# Define base directory where subfolders for each theme will be saved
base_directory = 'undersample_theme_features'

# List of themes to ensure subfolders exist for each one
theme_list = ['aspirational', 'familial', 'navigational', 'resistance', 'social']

# Ensure that the base directory exists, create if it doesn't
if not os.path.exists(base_directory):
    os.makedirs(base_directory)

# Loop through each theme and its corresponding merged dataset
for theme_name, merged_df in merged_dataset_dict.items():
    print(f"Processing theme: {theme_name}")

    # Ensure subfolder exists for the current theme
    theme_folder = os.path.join(base_directory, theme_name)
    if not os.path.exists(theme_folder):
        os.makedirs(theme_folder)

    # Call the prepare_data function and get the training and test DataFrames
    training_df, test_df = prepare_data(merged_df, test_size=0.1, resample=True, strategy='oversample', seed=18)

    # Print dataset shape information
    print(f"Training dataset shape: {training_df.shape} \nTest dataset shape: {test_df.shape}")

    # Count and print the number of positive labels in training and test sets
    pos_labels_train = training_df['label'].sum()
    print(f"Positive labels in training set: {pos_labels_train} out of {len(training_df)} ({(pos_labels_train / len(training_df)) * 100:.2f}%)")

    pos_labels_test = test_df['label'].sum()
    print(f"Positive labels in test set: {pos_labels_test} out of {len(test_df)} ({(pos_labels_test / len(test_df)) * 100:.2f}%)")

    # Feature engineering
    print(" - Feature engineering...")
    X_train, y_train = feature_engineering(training_df)
    X_test, y_test = feature_engineering(test_df)

    # Ensure that the labels are integers
    y_train = y_train.astype('int')
    y_test = y_test.astype('int')


    # Append unigrams (or perform other feature modifications)
    X_train_final, X_test_final = append_unigram(X_train, X_test)

    # Store the X and y train and test DataFrames in the dictionary as a tuple
    undersample_theme_dfs[theme_name] = (X_train_final, y_train, X_test_final, y_test)

    # Save each DataFrame as CSV in the respective theme subfolder
    X_train_file = os.path.join(theme_folder, 'X_train.csv')
    y_train_file = os.path.join(theme_folder, 'y_train.csv')
    X_test_file = os.path.join(theme_folder, 'X_test.csv')
    y_test_file = os.path.join(theme_folder, 'y_test.csv')

    X_train_final.to_csv(X_train_file, index=False)
    y_train.to_csv(y_train_file, index=False)
    X_test_final.to_csv(X_test_file, index=False)
    y_test.to_csv(y_test_file, index=False)

    print(f" - Saved feature datasets for theme '{theme_name}' to folder '{theme_folder}'.")

# Display the resulting dictionary
print("X and y train and test DataFrames stored for each theme:")
for theme, dfs in undersample_theme_dfs.items():
    print(f"{theme}: X_train shape: {dfs[0].shape}, y_train shape: {dfs[1].shape}, X_test shape: {dfs[2].shape}, y_test shape: {dfs[3].shape}")

In [24]:
def load_theme_features(theme_folder_base='theme_features', themes=['aspirational', 'familial', 'navigational', 'resistance', 'social']):
    """Load the saved CSVs for each theme from the theme_features folder.
    
    Parameters:
    theme_folder_base (str): The base folder containing the theme subfolders.
    themes (list): A list of theme names corresponding to the subfolder names.
    
    Returns:
    dict: A dictionary where the keys are the theme names, and the values are tuples containing 
          (X_train, y_train, X_test, y_test) DataFrames for each theme.
    """
    theme_dfs = {}

    for theme in themes:
        # Define the paths to the CSV files for the current theme
        theme_folder = os.path.join(theme_folder_base, theme)
        X_train_file = os.path.join(theme_folder, 'X_train.csv')
        y_train_file = os.path.join(theme_folder, 'y_train.csv')
        X_test_file = os.path.join(theme_folder, 'X_test.csv')
        y_test_file = os.path.join(theme_folder, 'y_test.csv')
        
        # Load the CSV files into DataFrames
        if os.path.exists(X_train_file) and os.path.exists(y_train_file) and os.path.exists(X_test_file) and os.path.exists(y_test_file):
            X_train = pd.read_csv(X_train_file)
            y_train = pd.read_csv(y_train_file)
            X_test = pd.read_csv(X_test_file)
            y_test = pd.read_csv(y_test_file)
            
            # Store them in the dictionary
            theme_dfs[theme] = (X_train, y_train, X_test, y_test)
            print(f"Loaded data for theme: {theme}")
        else:
            print(f"Missing files for theme: {theme}. Please check the folder structure.")
    
    return theme_dfs

# Usage Example:
theme_datasets = load_theme_features()



Loaded data for theme: aspirational
Loaded data for theme: familial
Loaded data for theme: navigational
Loaded data for theme: resistance
Loaded data for theme: social


In [27]:
theme_datasets["aspirational"][2]

Unnamed: 0,sentence,phrase,tokens,bio_sim_words,chem_sim_words,phy_sim_words,math_sim_words,tech_sim_words,eng_sim_words,medical_terms,...,sf,state,study,successful,things,thought,want,wanted,wasnt,work
0,"also another reason that i am here in this course is my major used to be math and physic when i was in my country and i did really good at them but as i moved here i change my major to biology to become optometric.after i done with my career my job should be related to math, physics,and biology.",['I moved here I change my major to biology to become optometric.'],"['also', 'another', 'reason', 'that', 'i', 'am', 'here', 'in', 'this', 'course', 'is', 'my', 'major', 'used', 'to', 'be', 'math', 'and', 'physic', 'when', 'i', 'was', 'in', 'my', 'country', 'and', 'i', 'did', 'really', 'good', 'at', 'them', 'but', 'as', 'i', 'moved', 'here', 'i', 'change', 'my', 'major', 'to', 'biology', 'to', 'become', 'optometric.after', 'i', 'done', 'with', 'my', 'career', 'my', 'job', 'should', 'be', 'related', 'to', 'math', ',', 'physics', ',', 'and', 'biology', '.']",5,5,1,5,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,i think i am here because i was born.,"['Well honestly, I am here because it is part of my pre req requirements to get into dental school.']","['i', 'think', 'i', 'am', 'here', 'because', 'i', 'was', 'born', '.']",0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,some of my goals are to become a brain surgeon or a researcher.,['I guess I want my life to be exciting and fun. Some of my goals are to become a brain surgeon or a researcher. I want to help people also live their best lives. I want to find cures to these diseases or at least prevent them.'],"['some', 'of', 'my', 'goals', 'are', 'to', 'become', 'a', 'brain', 'surgeon', 'or', 'a', 'researcher', '.']",0,0,0,0,0,0,1,...,0,0,0,0,0,0,0,0,0,0


In [24]:
X_test, y_test = feature_engineering(test_df)

In [25]:
X_test.shape

(179, 121)

In [26]:
y_test = y_test.astype('int')

**Calculate Unigram features for both train and test set**

In [27]:
X_train.shape

(1609, 121)

In [36]:
X_test.shape

(51, 121)

In [28]:
X_train.to_csv("/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/notebooks/experiments/exp_2.0/Aspirational/saved_features/X_train_final.csv", index=False)
X_test.to_csv("/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/notebooks/experiments/exp_2.1/Aspirational/saved_features/X_test_final.csv", index=False)
y_train.to_csv("/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/notebooks/experiments/exp_2.1/Aspirational/saved_features/y_train.csv", index=False)
y_test.to_csv("/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/notebooks/experiments/exp_2.1/Aspirational/saved_features/y_test.csv", index=False)


In [3]:
def append_unigram(X_TRAIN, X_TEST):
    unigram_matrix = unigram_vect.fit_transform(X_TRAIN['sentence'])
    unigrams = pd.DataFrame(unigram_matrix.toarray())
    unigrams = unigrams.reset_index(drop=True)
    print("Shape of the unigram df for train : ",unigrams.shape)
    X_train = pd.concat([X_TRAIN, unigrams_test], axis = 1)

    unigram_matrix_test = unigram_vect.transform(X_TEST['sentence'])
    unigrams_test = pd.DataFrame(unigram_matrix_test.toarray())
    unigrams_test = unigrams_test.reset_index(drop=True)
    print("Test unigram df shape : ",unigrams_test.shape)
    X_test = pd.concat([X_TEST, unigrams_test], axis = 1)

    return X_train, X_test

In [29]:
# Unigrams for training set
unigram_matrix = unigram_vect.fit_transform(X_train['sentence'])
unigrams = pd.DataFrame(unigram_matrix.toarray())
print("Shape of the unigram df for train : ",unigrams.shape)
unigrams = unigrams.reset_index(drop=True)

Shape of the unigram df for train :  (1609, 1188)


In [30]:
X_train_final = pd.concat([X_train, unigrams], axis = 1)

In [31]:
X_train_final.columns = X_train_final.columns.astype(str)

In [32]:
X_train_final.shape

(1609, 1309)

In [33]:
unigram_matrix_test = unigram_vect.transform(X_test['sentence'])
unigrams_test = pd.DataFrame(unigram_matrix_test.toarray())
unigrams_test = unigrams_test.reset_index(drop=True)
print("Test unigram df shape : ",unigrams_test.shape)

Test unigram df shape :  (179, 1188)


In [34]:
X_test_final = pd.concat([X_test, unigrams_test], axis = 1)
X_test_final.shape

(179, 1309)

In [35]:
X_test_final.columns = X_test_final.columns.astype(str)

In [36]:
X_test_final.shape

(179, 1309)

In [37]:
for i in range(0, len(X_train_final.columns)):
    print('{} ---- {}'.format(i, X_train_final.columns[i]))

0 ---- sentence
1 ---- phrase
2 ---- tokens
3 ---- bio_sim_words
4 ---- chem_sim_words
5 ---- phy_sim_words
6 ---- math_sim_words
7 ---- tech_sim_words
8 ---- eng_sim_words
9 ---- medical_terms
10 ---- polarity
11 ---- subjectivity
12 ---- ner
13 ---- interjections
14 ---- nouns
15 ---- adverb
16 ---- verb
17 ---- determiner
18 ---- pronoun
19 ---- adjetive
20 ---- preposition
21 ---- embedding0
22 ---- embedding1
23 ---- embedding2
24 ---- embedding3
25 ---- embedding4
26 ---- embedding5
27 ---- embedding6
28 ---- embedding7
29 ---- embedding8
30 ---- embedding9
31 ---- embedding10
32 ---- embedding11
33 ---- embedding12
34 ---- embedding13
35 ---- embedding14
36 ---- embedding15
37 ---- embedding16
38 ---- embedding17
39 ---- embedding18
40 ---- embedding19
41 ---- embedding20
42 ---- embedding21
43 ---- embedding22
44 ---- embedding23
45 ---- embedding24
46 ---- embedding25
47 ---- embedding26
48 ---- embedding27
49 ---- embedding28
50 ---- embedding29
51 ---- embedding30
52 ---- em

### LR Model 6: Without STEM Similarity

In [38]:
X_train_model_6 = X_train_final.iloc[:,np.r_[10:1308]]

In [39]:
X_train_model_6.shape

(1609, 1298)

In [40]:
X_test_model_6 = X_test_final.iloc[:,np.r_[10:1308]]

In [41]:
X_test_model_6.shape

(179, 1298)

In [42]:
model_6_pipeline = Pipeline([ 
                        ('clf', LogisticRegression(class_weight='balanced',random_state=18)),
                       ])

parameters = {
               'clf__C': [0.001,.009,0.01,.09,1,5,10,25],
               'clf__penalty' : ["l2"],
               'clf__solver': ['liblinear']
             }

grid_search = GridSearchCV(model_6_pipeline, parameters, scoring="average_precision", cv = 10, n_jobs=-1, verbose=1)

grid_search.fit(X_train_model_6,y_train)

print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()

for param_name in sorted(parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))
    

print(classification_report(y_test, grid_search.best_estimator_.predict(X_test_model_6), digits=2))

Fitting 10 folds for each of 8 candidates, totalling 80 fits
Best score: 0.741
Best parameters set:
	clf__C: 0.09
	clf__penalty: 'l2'
	clf__solver: 'liblinear'
              precision    recall  f1-score   support

           0       0.78      0.79      0.78        90
           1       0.78      0.78      0.78        89

    accuracy                           0.78       179
   macro avg       0.78      0.78      0.78       179
weighted avg       0.78      0.78      0.78       179



In [43]:
from sklearn.metrics import classification_report, average_precision_score

In [44]:
lr_model_6 = LogisticRegression(random_state=18, solver=best_parameters['clf__solver'], 
                                C=best_parameters['clf__C'], 
                                penalty=best_parameters['clf__penalty'], class_weight='balanced').fit(X_train_model_6, y_train)
y_lr = lr_model_6.predict(X_test_model_6)
print('Logistic regression Classifier')
tn, fp, fn, tp = confusion_matrix(y_test, y_lr).ravel()
print('True Negative: {}, False Positive: {}, False Negative: {}, True Positive: {}'.format(tn, fp, fn, tp))
print('-' * 80)
print(confusion_matrix(y_test, y_lr))
print('-' * 80)
print(classification_report(y_test, y_lr))

# Calculate and print the average precision score
avg_precision = average_precision_score(y_test, y_lr)
print(f'Average Precision: {avg_precision:.4f}')

Logistic regression Classifier
True Negative: 71, False Positive: 19, False Negative: 20, True Positive: 69
--------------------------------------------------------------------------------
[[71 19]
 [20 69]]
--------------------------------------------------------------------------------
              precision    recall  f1-score   support

           0       0.78      0.79      0.78        90
           1       0.78      0.78      0.78        89

    accuracy                           0.78       179
   macro avg       0.78      0.78      0.78       179
weighted avg       0.78      0.78      0.78       179

Average Precision: 0.7196


### RF Model 4: Without Embeddings

In [45]:
n_estimators = [10,20,50,100,200,300]
max_depth = [2,5,8,10,15,20]
min_samples_split = [2, 5, 10, 15, 20]
min_samples_leaf = [1, 2, 5, 10,20]
rf_parameters = dict(n_estimators = n_estimators, max_depth = max_depth,  
              min_samples_split = min_samples_split, 
              min_samples_leaf = min_samples_leaf)

## reduced parameters
# n_estimators = [50, 100, 200]       # Reduced options
# max_depth = [5, 10, 15]             # Reduced options
# min_samples_split = [2, 10]         # Reduced options
# min_samples_leaf = [1, 5]           # Reduced options

# rf_parameters = dict(
#     n_estimators = n_estimators, 
#     max_depth = max_depth,  
#     min_samples_split = min_samples_split, 
#     min_samples_leaf = min_samples_leaf
# )


In [46]:
X_train_model_4 = X_train_final.iloc[:,np.r_[3:21,121:1308]]

In [47]:
X_train_model_4.shape

(1609, 1205)

In [48]:
X_test_model_4 = X_test_final.iloc[:,np.r_[3:21,121:1308]]

In [49]:
X_test_model_4.shape

(179, 1205)

In [50]:
rf_model_4 = RandomForestClassifier(random_state=18,class_weight='balanced')
grid_search = GridSearchCV(rf_model_4, rf_parameters, scoring="average_precision", cv = 10, n_jobs=-1, verbose=1)
grid_search.fit(X_train_model_4,y_train)
print("Best score: %0.3f" % grid_search.best_score_)
print("Best parameters set:")
best_parameters = grid_search.best_estimator_.get_params()

for param_name in sorted(rf_parameters.keys()):
    print("\t%s: %r" % (param_name, best_parameters[param_name]))
    

print(classification_report(y_test, grid_search.best_estimator_.predict(X_test_model_4), digits=2))

Fitting 10 folds for each of 900 candidates, totalling 9000 fits
Best score: 0.736
Best parameters set:
	max_depth: 20
	min_samples_leaf: 2
	min_samples_split: 2
	n_estimators: 200
              precision    recall  f1-score   support

           0       0.79      0.79      0.79        90
           1       0.79      0.79      0.79        89

    accuracy                           0.79       179
   macro avg       0.79      0.79      0.79       179
weighted avg       0.79      0.79      0.79       179



In [51]:
randomForest_4 = RandomForestClassifier(random_state=18,
                                        class_weight=best_parameters['class_weight'],
                                        max_depth=best_parameters['max_depth'],
                                        min_samples_leaf=best_parameters['min_samples_leaf'],
                                        min_samples_split=best_parameters['min_samples_split'],
                                        n_estimators=best_parameters['n_estimators']).fit(X_train_model_4, y_train)

y_lr = randomForest_4.predict(X_test_model_4)
print('Logistic regression Classifier')
tn, fp, fn, tp = confusion_matrix(y_test, y_lr).ravel()
print('True Negative: {}, False Positive: {}, False Negative: {}, True Positive: {}'.format(tn, fp, fn, tp))
print('-' * 80)
print(confusion_matrix(y_test, y_lr))
print('-' * 80)
print(classification_report(y_test, y_lr))

# Calculate and print the average precision score
avg_precision = average_precision_score(y_test, y_lr)
print(f'Average Precision: {avg_precision:.4f}')

Logistic regression Classifier
True Negative: 71, False Positive: 19, False Negative: 19, True Positive: 70
--------------------------------------------------------------------------------
[[71 19]
 [19 70]]
--------------------------------------------------------------------------------
              precision    recall  f1-score   support

           0       0.79      0.79      0.79        90
           1       0.79      0.79      0.79        89

    accuracy                           0.79       179
   macro avg       0.79      0.79      0.79       179
weighted avg       0.79      0.79      0.79       179

Average Precision: 0.7248
