## Stadaridization of Datasets

1. Merge batch 1 and batch 2.
2. Create gold standards of single-theme datasets.
3. Create gold standards of merge-theme datasets.

In [3]:
import pandas as pd
import os

In [38]:
# Define file paths for Batch 1 and Batch 2 directories
batch_1_dir = "/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/data/processed_for_model/single_theme_using_jaccard_method"
batch_2_dir = "/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/data/processed_for_model/single_theme_using_jaccard_method/batch_2"

# List of theme files in Batch 1
batch_1_files = [f for f in os.listdir(batch_1_dir) if f.endswith('.csv')]
batch_2_files = [f for f in os.listdir(batch_2_dir) if f.endswith('.csv')]
# Dictionary to store merged DataFrames per theme
merged_themes = {}

In [39]:
batch_1_files

['Perseverance_sentence_level_batch_1_jaccard.csv',
 'Spiritual_sentence_level_batch_1_jaccard.csv',
 'Community Consciousness_sentence_level_batch_1_jaccard.csv',
 'Familial_sentence_level_batch_1_jaccard.csv',
 'Social_sentence_level_batch_1_jaccard.csv',
 'Attainment_sentence_level_batch_1_jaccard.csv',
 'Filial Piety_sentence_level_batch_1_jaccard.csv',
 'Aspirational_sentence_level_batch_1_jaccard.csv',
 'First Generation_sentence_level_batch_1_jaccard.csv',
 'Navigational_sentence_level_batch_1_jaccard.csv',
 'Resistance_sentence_level_batch_1_jaccard.csv']

In [40]:
batch_2_files

['Filial Piety_sentence_level_batch_2_jaccard.csv',
 'Attainment_sentence_level_batch_2_jaccard.csv',
 'Social_sentence_level_batch_2_jaccard.csv',
 'Familial_sentence_level_batch_2_jaccard.csv',
 'Community Consciousness_sentence_level_batch_2_jaccard.csv',
 'Spiritual_sentence_level_batch_2_jaccard.csv',
 'Perseverance_sentence_level_batch_2_jaccard.csv',
 'Resistance_sentence_level_batch_2_jaccard.csv',
 'Navigational_sentence_level_batch_2_jaccard.csv',
 'First Generation_sentence_level_batch_2_jaccard.csv',
 'Aspirational_sentence_level_batch_2_jaccard.csv']

In [62]:
def search_substring_in_column(df, column_name, substring):
    """
    Search for rows in a DataFrame where a specified column contains a given substring.

    Args:
        df (pd.DataFrame): The DataFrame to search.
        column_name (str): The name of the column to search within.
        substring (str): The substring to search for.

    Returns:
        pd.DataFrame: A DataFrame containing rows where the column contains the substring.
    """
    # Use str.contains() to search for the substring
    matching_rows = df[df[column_name].str.contains(substring, case=False, na=False)]
    return matching_rows


In [65]:
# Function to merge files by theme
def merge_files_by_theme(batch_1_files, batch_2_files, batch_1_dir, batch_2_dir):
    merged_themes = {}
    
    for file_1 in batch_1_files:
        # Extract theme name from Batch 1 file
        theme_name = file_1.replace('_sentence_level_batch_1_jaccard.csv', '')
        
        # Locate the corresponding Batch 2 file
        file_2 = f"{theme_name}_sentence_level_batch_2_jaccard.csv"
        
        # Paths to the files
        batch_1_path = os.path.join(batch_1_dir, file_1)
        batch_2_path = os.path.join(batch_2_dir, file_2)
        
        # Load data from Batch 1 and Batch 2
        batch_1_data = pd.read_csv(batch_1_path)
        batch_2_data = pd.read_csv(batch_2_path)
        
        # Add a 'batch' column to each dataset
        batch_1_data['batch'] = 1
        batch_2_data['batch'] = 2
        
        # Merge the two datasets
        merged_data = pd.concat([batch_1_data, batch_2_data], ignore_index=True)
        
        # Store merged data in dictionary
        merged_themes[theme_name] = merged_data
    
    return merged_themes

# Function to check for duplicates in a single DataFrame
def find_duplicates(data, column_name='sentence'):
    """
    Check for duplicates in a DataFrame based on a specific column.

    Args:
        data (pd.DataFrame): The DataFrame to check for duplicates.
        column_name (str): The name of the column to check for duplicates.

    Returns:
        pd.DataFrame: DataFrame containing duplicate rows based on the specified column.
    """
    duplicate_rows = data[data.duplicated(subset=[column_name], keep=False)]
    return duplicate_rows

duplicates = {}

for theme, data in merged_themes.items():
    duplicate_rows = find_duplicates(data)
    if not duplicate_rows.empty:
        duplicates[theme] = duplicate_rows

# Merge files and find duplicates
merged_themes = merge_files_by_theme(batch_1_files, batch_2_files, batch_1_dir, batch_2_dir)

# Print themes with duplicates and the number of duplicates
for theme, dup_data in duplicates.items():
    print(f"Theme: {theme}, Number of duplicates: {len(dup_data)}")

Theme: Perseverance, Number of duplicates: 91
Theme: Spiritual, Number of duplicates: 152
Theme: Community Consciousness, Number of duplicates: 36
Theme: Familial, Number of duplicates: 76
Theme: Social, Number of duplicates: 88
Theme: Attainment, Number of duplicates: 651
Theme: Filial Piety, Number of duplicates: 142
Theme: Aspirational, Number of duplicates: 1064
Theme: First Generation, Number of duplicates: 46
Theme: Navigational, Number of duplicates: 569
Theme: Resistance, Number of duplicates: 9


In [42]:
# Function to remove duplicates but keep the first occurrence
def remove_duplicates_keep_first(merged_themes):
    cleaned_themes = {}
    
    for theme, data in merged_themes.items():
        # Drop duplicates but keep the first occurrence
        unique_data = data.drop_duplicates(keep='first')
        cleaned_themes[theme] = unique_data
    
    return cleaned_themes

cleaned_themes = remove_duplicates_keep_first(merged_themes)

In [44]:
merged_themes["Familial"]

Unnamed: 0,sentence,label,phrase,batch
0,i'm here because my parents came here during t...,0,"[""I'm here because my parents came here during...",1
1,i mean my mother wouldn't have been on a boat ...,0,"[""I'm here because my parents came here during...",1
2,she made a lot of sacrifices for me to be here.,0,"[""I'm here because my parents came here during...",1
3,i guess you can say the same for my fathers sa...,0,"[""I'm here because my parents came here during...",1
4,i am also here because the formula to being su...,0,"[""I'm here because my parents came here during...",1
...,...,...,...,...
1428,i am taking these courses in order to graduate...,0,['I am here because I want to be the first one...,2
1429,i am here because i want to be the first one o...,0,['I am here because I want to be the first one...,2
1430,i sometimes have to force myself to get to sch...,0,['I am here because I want to be the first one...,2
1431,i am here with a full schedule because i hope ...,0,['I am here because I want to be the first one...,2


In [36]:
cleaned_themes["Attainment"]

Unnamed: 0,sentence,label,phrase,batch
0,why am i here?,0,"[""Ever since I was little I wanted to be a doc...",1
1,well why does anyone pursue a higher education?,0,"[""Ever since I was little I wanted to be a doc...",1
2,to better one self and be able to succeed late...,0,"[""Ever since I was little I wanted to be a doc...",1
3,ever since i was little i wanted to be a docto...,1,"[""Ever since I was little I wanted to be a doc...",1
4,i always wanted to be able to help people and ...,0,"[""Ever since I was little I wanted to be a doc...",1
...,...,...,...,...
4608,physics will be very helpful in reallife situa...,0,['After receiving my bachelors degree I hope t...,2
4609,the physics lab specifically is showing me iso...,0,['After receiving my bachelors degree I hope t...,2
4610,an example of me using this once i become a do...,0,['After receiving my bachelors degree I hope t...,2
4611,if i understand the mechanics of the physics b...,0,['After receiving my bachelors degree I hope t...,2


## Corrections

In [46]:
familial_plus_df = pd.read_csv("/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/data/processed_for_model/corrections/Familial/Familial_Merged_evaluated.csv")
navigational_plus_df = pd.read_csv("/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/data/processed_for_model/corrections/Navigational/Navigational_merged.csv")


In [48]:
merged_themes["Familial"]

Unnamed: 0,sentence,label,phrase,batch
0,i'm here because my parents came here during t...,0,"[""I'm here because my parents came here during...",1
1,i mean my mother wouldn't have been on a boat ...,0,"[""I'm here because my parents came here during...",1
2,she made a lot of sacrifices for me to be here.,0,"[""I'm here because my parents came here during...",1
3,i guess you can say the same for my fathers sa...,0,"[""I'm here because my parents came here during...",1
4,i am also here because the formula to being su...,0,"[""I'm here because my parents came here during...",1
...,...,...,...,...
1428,i am taking these courses in order to graduate...,0,['I am here because I want to be the first one...,2
1429,i am here because i want to be the first one o...,0,['I am here because I want to be the first one...,2
1430,i sometimes have to force myself to get to sch...,0,['I am here because I want to be the first one...,2
1431,i am here with a full schedule because i hope ...,0,['I am here because I want to be the first one...,2


In [47]:
familial_plus_df

Unnamed: 0,sentence,label,phrase
0,after i graduate i don't know what i really wa...,0,"[""I'm just going through the requirements to g..."
1,i am here to make my mother proud and also to ...,1,['I am here to make my mother proud and also t...
2,but sometimes i just feel like changing my maj...,0,"[""If it wasn't for my family specifically my s..."
3,"it wasnt easy, since i have always had a hard ...",0,['Since no one in my family ever lived the col...
4,although there are some aspects i still find c...,0,['I almost attended a city college after gradu...
...,...,...,...
2105,i took the jc route; college tuition was too e...,0,['my parents came to this country to give thei...
2106,i genuinely want to become a great mechanical ...,0,"[""When I was younger, the desire to go into th..."
2107,"in order to become a licensed psychologist, i ...",0,"['As for my mother, I want her to see that all..."
2108,i also think that all college stuff gets you p...,0,['Im also here for my family that have helped ...


In [70]:
temp_fam_df = pd.concat([familial_plus_df, merged_themes["Familial"]])
temp_fil_df = pd.concat([familial_plus_df, merged_themes["Filial Piety"]])
temp_first_gen_df = pd.concat([familial_plus_df, merged_themes["First Generation"]])

fam_dups = find_duplicates(temp_fam_df)
fil_dups = find_duplicates(temp_fil_df)
fg_dups = find_duplicates(temp_first_gen_df)


In [64]:
# Example usage
# Assuming 'sentence' is the column name
substring_to_search = "after i graduate i don't know what i really want to do."
matching_rows = search_substring_in_column(temp_fam_df, 'sentence', substring_to_search)

# Display results
matching_rows

Unnamed: 0,sentence,label,phrase,batch
0,after i graduate i don't know what i really wa...,0,"[""I'm just going through the requirements to g...",


In [80]:
merged_themes["Familial"].shape
merged_themes["Filial Piety"].shape
merged_themes["First Generation"].shape

(471, 4)

In [78]:
fam_correction = temp_fam_df.drop_duplicates(subset="sentence", keep="first")
fam_correction

Unnamed: 0,sentence,label,phrase,batch
0,after i graduate i don't know what i really wa...,0,"[""I'm just going through the requirements to g...",
1,i am here to make my mother proud and also to ...,1,['I am here to make my mother proud and also t...,
2,but sometimes i just feel like changing my maj...,0,"[""If it wasn't for my family specifically my s...",
3,"it wasnt easy, since i have always had a hard ...",0,['Since no one in my family ever lived the col...,
4,although there are some aspects i still find c...,0,['I almost attended a city college after gradu...,
...,...,...,...,...
1241,i am at sf state because i want to take advant...,0,"['My parents were not able to go to college, s...",2.0
1242,"my parents were not able to go to college, so ...",0,"['My parents were not able to go to college, s...",2.0
1243,my family has motivated me to do well in schoo...,0,"['My parents were not able to go to college, s...",2.0
1244,i also want to represent my latino community i...,0,"['My parents were not able to go to college, s...",2.0


In [72]:
fil_dups

Unnamed: 0,sentence,label,phrase,batch
0,after i graduate i don't know what i really wa...,0,"[""I'm just going through the requirements to g...",
1,i am here to make my mother proud and also to ...,1,['I am here to make my mother proud and also t...,
2,but sometimes i just feel like changing my maj...,0,"[""If it wasn't for my family specifically my s...",
5,i want to be remembered by everyone i am hopef...,0,['I want to also be able to set up my family f...,
7,i have always had a passion for hands on work ...,0,['In todays society most individuals want to s...,
...,...,...,...,...
1454,i am here to enjoy life as well and learn as m...,0,['I am here to excel in school and make my par...,2.0
1455,i am here to learn from my peers and teachers ...,0,['I am here to excel in school and make my par...,2.0
1456,i feel like i am also here to teach others my ...,0,['I am here to excel in school and make my par...,2.0
1457,i believe that everyone in the world can learn...,0,['I am here to excel in school and make my par...,2.0


In [73]:
fg_dups

Unnamed: 0,sentence,label,phrase,batch
3,"it wasnt easy, since i have always had a hard ...",0,['Since no one in my family ever lived the col...,
14,i am here because of my family.,1,['I am here because of my family. I am the fir...,
15,why am i here?there's a lot of reasons as to w...,0,"[""Another reason is to why I'm here is because...",
22,being able to do good in this class gives me t...,0,['As a firstgeneration college student it has ...,
27,i am here because i fell in love with the worl...,0,['I am here because I want to further advance ...,
...,...,...,...,...
459,i am here to hopefully pass this class and com...,0,['I wouldnt be here without my parents who sac...,2.0
460,i am here to get a degree to go on and become ...,0,['I wouldnt be here without my parents who sac...,2.0
461,i wouldnt be here without my parents who sacri...,0,['I wouldnt be here without my parents who sac...,2.0
462,i am part of the first generation in my family...,1,['I wouldnt be here without my parents who sac...,2.0


In [61]:
familial_plus_df

Unnamed: 0,sentence,label,phrase
0,after i graduate i don't know what i really wa...,0,"[""I'm just going through the requirements to g..."
1,i am here to make my mother proud and also to ...,1,['I am here to make my mother proud and also t...
2,but sometimes i just feel like changing my maj...,0,"[""If it wasn't for my family specifically my s..."
3,"it wasnt easy, since i have always had a hard ...",0,['Since no one in my family ever lived the col...
4,although there are some aspects i still find c...,0,['I almost attended a city college after gradu...
...,...,...,...
2105,i took the jc route; college tuition was too e...,0,['my parents came to this country to give thei...
2106,i genuinely want to become a great mechanical ...,0,"[""When I was younger, the desire to go into th..."
2107,"in order to become a licensed psychologist, i ...",0,"['As for my mother, I want her to see that all..."
2108,i also think that all college stuff gets you p...,0,['Im also here for my family that have helped ...


In [86]:
def update_labels(single_file_df, corrected_data, key_column='sentence', label_column='label'):
    """
    Update the labels of a single theme file using the corrected data.

    Args:
        single_file_path (str): Path to the single theme file.
        corrected_data (pd.DataFrame): DataFrame containing corrected labels.
        key_column (str): The column to match on (e.g., 'sentence').
        label_column (str): The column containing the labels.

    Returns:
        pd.DataFrame: Updated DataFrame with corrected labels.
    """
    # Load the single theme file
    # single_data = pd.read_csv(single_file_path)
    single_data = single_file_df
    
    # Merge with corrected data, prioritizing corrected labels
    merged_data = pd.concat([corrected_data, single_data], ignore_index=True)
    
    # Drop duplicates, keeping the first occurrence (corrected version)
    updated_data = merged_data.drop_duplicates(subset=[key_column], keep='first')
    
    return updated_data

In [87]:
updated_familial = update_labels(merged_themes["Familial"], familial_plus_df)
updated_familial

Unnamed: 0,sentence,label,phrase,batch
0,after i graduate i don't know what i really wa...,0,"[""I'm just going through the requirements to g...",
1,i am here to make my mother proud and also to ...,1,['I am here to make my mother proud and also t...,
2,but sometimes i just feel like changing my maj...,0,"[""If it wasn't for my family specifically my s...",
3,"it wasnt easy, since i have always had a hard ...",0,['Since no one in my family ever lived the col...,
4,although there are some aspects i still find c...,0,['I almost attended a city college after gradu...,
...,...,...,...,...
3351,i am at sf state because i want to take advant...,0,"['My parents were not able to go to college, s...",2.0
3352,"my parents were not able to go to college, so ...",0,"['My parents were not able to go to college, s...",2.0
3353,my family has motivated me to do well in schoo...,0,"['My parents were not able to go to college, s...",2.0
3354,i also want to represent my latino community i...,0,"['My parents were not able to go to college, s...",2.0


In [98]:
def update_labels_exact(single_data, corrected_data, key_column='sentence', label_column='label'):
    """
    Update the labels of a single theme DataFrame using the corrected data,
    ensuring the row count matches the original.

    Args:
        single_data (pd.DataFrame): DataFrame containing the single theme data.
        corrected_data (pd.DataFrame): DataFrame containing corrected labels.
        key_column (str): The column to match on (e.g., 'sentence').
        label_column (str): The column containing the labels.

    Returns:
        pd.DataFrame: Updated DataFrame with the same row count as the original.
    """
    # Merge the single theme data with corrected data to update labels
    updated_data = single_data.merge(
        corrected_data[[key_column, label_column]],
        on=key_column,
        how='left',  # Keep only rows in the original data
        suffixes=('', '_corrected')
    )
    
    # Use the corrected label where available, fallback to original
    updated_data[label_column] = updated_data[f"{label_column}_corrected"].fillna(updated_data[label_column])
    
    # Drop the temporary corrected column
    updated_data.drop(columns=[f"{label_column}_corrected"], inplace=True)
    
    # Convert labels to integers
    updated_data[label_column] = updated_data[label_column].astype(int)
    
    return updated_data

In [99]:
updated_familial = update_labels_exact(merged_themes["Familial"], familial_plus_df)
updated_familial.label.value_counts()

label
0    1148
1     285
Name: count, dtype: int64

In [100]:
search_substring_in_column(familial_plus_df, "sentence", "i'm here because my parents came")

Unnamed: 0,sentence,label,phrase
1855,i'm here because my parents came here during t...,1,"[""I'm here because my parents came here during..."


In [103]:
navigational

Unnamed: 0,sentence,label,phrase,batch
0,i'm here because my parents came here during t...,0,"[""I'm here because my parents came here during...",1
1,i mean my mother wouldn't have been on a boat ...,0,"[""I'm here because my parents came here during...",1
2,she made a lot of sacrifices for me to be here.,0,"[""I'm here because my parents came here during...",1
3,i guess you can say the same for my fathers sa...,0,"[""I'm here because my parents came here during...",1
4,i am also here because the formula to being su...,0,"[""I'm here because my parents came here during...",1
...,...,...,...,...
1428,i am taking these courses in order to graduate...,0,['I am here because I want to be the first one...,2
1429,i am here because i want to be the first one o...,0,['I am here because I want to be the first one...,2
1430,i sometimes have to force myself to get to sch...,0,['I am here because I want to be the first one...,2
1431,i am here with a full schedule because i hope ...,0,['I am here because I want to be the first one...,2


In [None]:
updated_familial = update_labels_exact(merged_themes["Familial"], familial_plus_df)
updated_familial.label.value_counts()

In [104]:
updated_filial = update_labels_exact(merged_themes["Filial Piety"], familial_plus_df)
updated_filial.label.value_counts()

label
0    1136
1     323
Name: count, dtype: int64

In [105]:
updated_filial

Unnamed: 0,sentence,label,phrase,batch
0,i'm here because my parents came here during t...,1,['She made a lot of sacrifices for me to be he...,1
1,i mean my mother wouldn't have been on a boat ...,0,['She made a lot of sacrifices for me to be he...,1
2,she made a lot of sacrifices for me to be here.,1,['She made a lot of sacrifices for me to be he...,1
3,i guess you can say the same for my fathers sa...,1,['She made a lot of sacrifices for me to be he...,1
4,i am also here because the formula to being su...,0,['She made a lot of sacrifices for me to be he...,1
...,...,...,...,...
1454,i am here to enjoy life as well and learn as m...,0,['I am here to excel in school and make my par...,2
1455,i am here to learn from my peers and teachers ...,0,['I am here to excel in school and make my par...,2
1456,i feel like i am also here to teach others my ...,0,['I am here to excel in school and make my par...,2
1457,i believe that everyone in the world can learn...,0,['I am here to excel in school and make my par...,2


In [106]:
updated_fg = update_labels_exact(merged_themes["First Generation"], familial_plus_df)
updated_fg.label.value_counts()

label
0    349
1    122
Name: count, dtype: int64

In [107]:
updated_fg

Unnamed: 0,sentence,label,phrase,batch
0,i am here at sfsu because i am a first generat...,1,['I am here at SFSU because I am a first gener...,1
1,school has always helped me escape the hard th...,0,['I am here at SFSU because I am a first gener...,1
2,"as the oldest of 3 siblings, i want to be a ro...",1,['I am here at SFSU because I am a first gener...,1
3,i have always valued my education and i think ...,0,['I am here at SFSU because I am a first gener...,1
4,i want to help my community while i'm here and...,0,['I am here at SFSU because I am a first gener...,1
...,...,...,...,...
466,i am also here in college because i wanted to ...,0,"['', 'I am also here in college because I want...",2
467,a lot of my family members did not go to colle...,0,"['', 'I am also here in college because I want...",2
468,i wanted to be that person for them so i moved...,0,"['', 'I am also here in college because I want...",2
469,i am the first one in my family to move out of...,0,"['', 'I am also here in college because I want...",2


In [112]:
navigational_plus_df.updated_label.value_counts()

updated_label
0    6900
1    1302
Name: count, dtype: int64

In [113]:
merged_themes["Navigational"].label.value_counts()

label
0    6900
1    1302
Name: count, dtype: int64

In [111]:
merged_themes["Navigational"]

Unnamed: 0,sentence,phrase,label,batch
0,i am here because i want to better myself my f...,['Being in this instituion will pave a way for...,0,1
1,being in this instituion will pave a way for m...,['Being in this instituion will pave a way for...,1,1
2,i know that as a child i never thought of educ...,['Being in this instituion will pave a way for...,0,1
3,i hope to reach a position in which i can insp...,['Being in this instituion will pave a way for...,0,1
4,"also, statistics have shown that people with h...",['Being in this instituion will pave a way for...,0,1
...,...,...,...,...
8197,physics will be very helpful in reallife situa...,['My main reason for taking this class is to p...,0,2
8198,the physics lab specifically is showing me iso...,['My main reason for taking this class is to p...,0,2
8199,an example of me using this once i become a do...,['My main reason for taking this class is to p...,0,2
8200,if i understand the mechanics of the physics b...,['My main reason for taking this class is to p...,0,2


In [115]:
social_plus_df = pd.read_csv("/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/data/processed_for_model/corrections/Social/Social_merged_Reevaluated_3.csv")

social_plus_df
print(social_plus_df.label.value_counts())

label
0    2552
1     408
Name: count, dtype: int64


In [117]:
updated_social = update_labels_exact(merged_themes["Social"], social_plus_df)
updated_social.label.value_counts()

label
0    1487
1     233
Name: count, dtype: int64

In [118]:
updated_community = update_labels_exact(merged_themes["Community Consciousness"], social_plus_df)
updated_community.label.value_counts()

label
0    275
1     74
Name: count, dtype: int64

In [119]:
updated_spiritual = update_labels_exact(merged_themes["Spiritual"], social_plus_df)
updated_spiritual.label.value_counts()

label
0    1413
1     199
Name: count, dtype: int64

In [124]:
updated_social
updated_social.loc[0, 'label'] = 0  
updated_social.loc[4, 'label'] = 1

In [125]:
updated_social

Unnamed: 0,sentence,label,phrase,batch
0,i am here to help myself grow.,0,['Meet people who have (similar) aspirationsgo...,1
1,learn and apply my skills to progress towards ...,0,['Meet people who have (similar) aspirationsgo...,1
2,understand my purpose.,0,['Meet people who have (similar) aspirationsgo...,1
3,meet people who have (similar) aspirationsgoal...,1,['Meet people who have (similar) aspirationsgo...,1
4,to learn more about the natural world and buil...,1,['I am here to learn and build relationships w...,1
...,...,...,...,...
1715,although the class has been proving to be a li...,0,['I will also seek help from my peers as they ...,2
1716,i have heard from previous students that this ...,1,['I will also seek help from my peers as they ...,2
1717,the sci class will be a great help this semest...,0,['I will also seek help from my peers as they ...,2
1718,i will also seek help from my peers as they ma...,0,['I will also seek help from my peers as they ...,2


In [122]:
updated_community

Unnamed: 0,sentence,label,phrase,batch
0,i am here because i want to better myself my f...,0,['I Hope to reach a position in which I can in...,1
1,being in this instituion will pave a way for m...,0,['I Hope to reach a position in which I can in...,1
2,i know that as a child i never thought of educ...,0,['I Hope to reach a position in which I can in...,1
3,i hope to reach a position in which i can insp...,1,['I Hope to reach a position in which I can in...,1
4,"also, statistics have shown that people with h...",0,['I Hope to reach a position in which I can in...,1
...,...,...,...,...
344,i want to be in the medical field for animal c...,0,['I want to make a positive difference in my c...,2
345,i want to make a positive difference in my com...,1,['I want to make a positive difference in my c...,2
346,i am here in this class to travel my path to t...,0,['I want to make a positive difference in my c...,2
347,i know i am in this class for a couple of reas...,0,['I want to make a positive difference in my c...,2


In [123]:
updated_spiritual

Unnamed: 0,sentence,label,phrase,batch
0,why am i here?,0,['I always wanted to be able to help people'],1
1,well why does anyone pursue a higher education?,0,['I always wanted to be able to help people'],1
2,to better one self and be able to succeed late...,0,['I always wanted to be able to help people'],1
3,ever since i was little i wanted to be a docto...,0,['I always wanted to be able to help people'],1
4,i always wanted to be able to help people and ...,1,['I always wanted to be able to help people'],1
...,...,...,...,...
1607,"when i was younger, my mother was diagnosed wi...",0,"[""It then narrows down to the things that I fi...",2
1608,after experiencing those years of watching her...,0,"[""It then narrows down to the things that I fi...",2
1609,"also, i've been granted the gift of imaginatio...",0,"[""It then narrows down to the things that I fi...",2
1610,"i truly believe my huge personality, when comb...",0,"[""It then narrows down to the things that I fi...",2


In [126]:
resistance_plus_df = pd.read_csv("/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/data/processed_for_model/corrections/Resistance/Resistance_Plus_Merged.csv")
resistance_plus_df.label.value_counts()

label
0    2067
1     131
Name: count, dtype: int64

In [133]:
attainment_df_batch_1 = pd.read_csv("/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/data/processed_for_model/corrections/Attainment/Attainment_sentence_level_batch_1_jaccard_reevaluated.csv", usecols=["sentence", "label", "phrase"])
attainment_df_batch_2 = pd.read_csv("/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/data/processed_for_model/corrections/Attainment/Attainment_sentence_level_batch_2_jaccard_reevaluated.csv", usecols=["sentence", "label", "phrase"])

aspirational_df_batch_1 = pd.read_csv("/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/data/processed_for_model/corrections/Aspirational/Aspirational_sentence_level_batch_1_jaccard_reevaluated.csv", usecols=["sentence", "label", "phrase"])
aspirational_df_batch_2 = pd.read_csv("/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/data/processed_for_model/corrections/Aspirational/Aspirational_sentence_level_batch_2_jaccard_reevaluated.csv", usecols=["sentence", "label", "phrase"])

attainment_df = pd.concat([attainment_df_batch_1, attainment_df_batch_2])
aspirational_df = pd.concat([aspirational_df_batch_1, aspirational_df_batch_2])



In [134]:
attainment_df

Unnamed: 0,sentence,label,phrase
0,why am i here?,0,"[""Ever since I was little I wanted to be a doc..."
1,well why does anyone pursue a higher education?,0,"[""Ever since I was little I wanted to be a doc..."
2,to better one self and be able to succeed late...,0,"[""Ever since I was little I wanted to be a doc..."
3,ever since i was little i wanted to be a docto...,1,"[""Ever since I was little I wanted to be a doc..."
4,i always wanted to be able to help people and ...,0,"[""Ever since I was little I wanted to be a doc..."
...,...,...,...
2940,physics will be very helpful in reallife situa...,0,['After receiving my bachelors degree I hope t...
2941,the physics lab specifically is showing me iso...,0,['After receiving my bachelors degree I hope t...
2942,an example of me using this once i become a do...,1,['After receiving my bachelors degree I hope t...
2943,if i understand the mechanics of the physics b...,0,['After receiving my bachelors degree I hope t...


In [129]:
merged_themes["Resistance"].label.value_counts()

label
0    287
1     22
Name: count, dtype: int64

In [130]:
updated_resistance = update_labels_exact(merged_themes["Resistance"], resistance_plus_df)
updated_resistance.label.value_counts()

label
0    471
1     25
Name: count, dtype: int64

In [131]:
updated_pers = update_labels_exact(merged_themes["Perseverance"], resistance_plus_df)
updated_pers.label.value_counts()

label
0    2340
1     112
Name: count, dtype: int64

In [132]:
aspirational_batch_1

Unnamed: 0,sentence,label,phrase,batch
0,i am here at sfsu because i am a first generat...,0,['School has always helped me escape the hard ...,1
1,school has always helped me escape the hard th...,1,['School has always helped me escape the hard ...,1
2,"as the oldest of 3 siblings, i want to be a ro...",0,['School has always helped me escape the hard ...,1
3,i have always valued my education and i think ...,0,['School has always helped me escape the hard ...,1
4,i want to help my community while i'm here and...,0,['School has always helped me escape the hard ...,1
...,...,...,...,...
2447,it sounds clich but it really is the truth.,0,['They taught me the value of hard work and pe...,2
2448,it isn't just due to the fact that they're hel...,0,['They taught me the value of hard work and pe...,2
2449,they taught me the value of hard work and pers...,1,['They taught me the value of hard work and pe...,2
2450,"that is the reason as to ""why i am here""...",0,['They taught me the value of hard work and pe...,2


In [135]:
updated_familial.to_csv("/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/data/processed_for_model/single_theme_using_jaccard_method/gold_standard/Familial_batch_1_batch_2_gold_standard.csv", index=False)
updated_fg.to_csv("/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/data/processed_for_model/single_theme_using_jaccard_method/gold_standard/First_Generation_batch_1_batch_2_gold_standard.csv", index=False)
updated_filial.to_csv("/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/data/processed_for_model/single_theme_using_jaccard_method/gold_standard/Filial_Piety_batch_1_batch_2_gold_standard.csv", index=False)
navigational_plus_df.to_csv("/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/data/processed_for_model/single_theme_using_jaccard_method/gold_standard/Navigational_batch_1_batch_2_gold_standard.csv", index=False)
updated_social.to_csv("/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/data/processed_for_model/single_theme_using_jaccard_method/gold_standard/Social_batch_1_batch_2_gold_standard.csv", index=False)
updated_community.to_csv("/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/data/processed_for_model/single_theme_using_jaccard_method/gold_standard/Community_Consciousness_batch_1_batch_2_gold_standard.csv", index=False)
updated_spiritual.to_csv("/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/data/processed_for_model/single_theme_using_jaccard_method/gold_standard/Spiritual_batch_1_batch_2_gold_standard.csv", index=False)
updated_resistance.to_csv("/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/data/processed_for_model/single_theme_using_jaccard_method/gold_standard/Resistance_batch_1_batch_2_gold_standard.csv", index=False)
updated_pers.to_csv("/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/data/processed_for_model/single_theme_using_jaccard_method/gold_standard/Perseverance_batch_1_batch_2_gold_standard.csv", index=False)
attainment_df.to_csv("/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/data/processed_for_model/single_theme_using_jaccard_method/gold_standard/Attainment_batch_1_batch_2_gold_standard.csv", index=False)
aspirational_df.to_csv("/Users/gbaldonado/Developer/ml-alma-taccti/ml-alma-taccti/data/processed_for_model/single_theme_using_jaccard_method/gold_standard/Aspirational_batch_1_batch_2_gold_standard.csv", index=False)

In [139]:
updated_social

Unnamed: 0,sentence,label,phrase,batch
0,i am here to help myself grow.,0,['Meet people who have (similar) aspirationsgo...,1
1,learn and apply my skills to progress towards ...,0,['Meet people who have (similar) aspirationsgo...,1
2,understand my purpose.,0,['Meet people who have (similar) aspirationsgo...,1
3,meet people who have (similar) aspirationsgoal...,1,['Meet people who have (similar) aspirationsgo...,1
4,to learn more about the natural world and buil...,1,['I am here to learn and build relationships w...,1
...,...,...,...,...
1715,although the class has been proving to be a li...,0,['I will also seek help from my peers as they ...,2
1716,i have heard from previous students that this ...,1,['I will also seek help from my peers as they ...,2
1717,the sci class will be a great help this semest...,0,['I will also seek help from my peers as they ...,2
1718,i will also seek help from my peers as they ma...,0,['I will also seek help from my peers as they ...,2
