# Prepare Dataset for Essay-level and Sentence-level Classification: Attainment

*Author: Gian Baldonado*

In [1]:
import pandas as pd
import re
import csv
from nltk import word_tokenize,pos_tag
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords 
from nltk import stem
from nltk import sent_tokenize
from fuzzywuzzy import fuzz

from sklearn.model_selection import train_test_split



In [3]:
lemmatizer = WordNetLemmatizer()
stop_words = set(stopwords.words('english'))

## 1.) Load the essays

Specifically for **Attainment**, we use `Attainment_CapitalsPilot_Fall2019.xlsx` file because the Alma team has finished labelling essays of the theme in fall 2019. The next effort to label essays with this theme is in 2023 when research assistants over the summer validated files that outputs of TACCTI. 

There are 570 essays in the file above.

In [25]:
# Read the essays and create a dataframe
df = pd.concat(pd.read_excel('../data-analysis/data/annotated_xlsx/Attainment_CapitalsPilot_Fall2019.xlsx', sheet_name=None),ignore_index=True,sort=False)

In [26]:
df.shape

(570, 4)

In [27]:
df.head()

Unnamed: 0,ID CODE,Essay : Why am I here,Attainment - FINAL,First Gen - FINAL
0,pilot_study_01,"I am here to get my BS in Physiology, so I can...",I am here to get my BS in physiology so I can ...,0
1,pilot_study_02,I want to stay in my community & help educate ...,0,0
2,pilot_study_03,I am here to provide myself with opportunities...,0,0
3,pilot_study_04,I am here to figure out what I want to do with...,0,0
4,pilot_study_05,I am here at San Francisco State because I wan...,0,0


In [28]:
# Ignore first gen column because we're only interested in attainment for now.
df = df.drop(columns=['First Gen - FINAL'])

In [29]:
df.rename(columns={"Essay : Why am I here": 'essay', "Attainment - FINAL":"phrase" }, inplace=True)
df

Unnamed: 0,ID CODE,essay,phrase
0,pilot_study_01,"I am here to get my BS in Physiology, so I can...",I am here to get my BS in physiology so I can ...
1,pilot_study_02,I want to stay in my community & help educate ...,0
2,pilot_study_03,I am here to provide myself with opportunities...,0
3,pilot_study_04,I am here to figure out what I want to do with...,0
4,pilot_study_05,I am here at San Francisco State because I wan...,0
...,...,...,...
565,F19.PHYS242.05.04,Why am I here? †I am at San Francisco State Un...,I am at San Francisco State University to get ...
566,F19.PHYS242.05.05,"Why am I here? Honestly, I think I'm here by s...",0
567,F19.PHYS242.05.06,One of the main reasons why I came to Californ...,0
568,F19.PHYS242.05.07,When I was a junior and senior in high school ...,0


## 2.) Prepare essay-level dataset

In [30]:
essay_df = df.copy()
essay_df["label"] = ["Yes" if text != 0 else "No" for text in essay_df["phrase"]]
essay_df

Unnamed: 0,ID CODE,essay,phrase,label
0,pilot_study_01,"I am here to get my BS in Physiology, so I can...",I am here to get my BS in physiology so I can ...,Yes
1,pilot_study_02,I want to stay in my community & help educate ...,0,No
2,pilot_study_03,I am here to provide myself with opportunities...,0,No
3,pilot_study_04,I am here to figure out what I want to do with...,0,No
4,pilot_study_05,I am here at San Francisco State because I wan...,0,No
...,...,...,...,...
565,F19.PHYS242.05.04,Why am I here? †I am at San Francisco State Un...,I am at San Francisco State University to get ...,Yes
566,F19.PHYS242.05.05,"Why am I here? Honestly, I think I'm here by s...",0,No
567,F19.PHYS242.05.06,One of the main reasons why I came to Californ...,0,No
568,F19.PHYS242.05.07,When I was a junior and senior in high school ...,0,No


In [11]:
def print_counts(df, col):
    value_counts = col.value_counts()
    print("Counts:\n", value_counts)
    percentages = (value_counts / len(df)) * 100
    print("Percentages:\n", percentages)

In [17]:
test = pd.read_csv("./new_data/attainment/attainment_attainment_fall2019_2023_sentence_test_data.csv")
train = pd.read_csv("./new_data/attainment/attainment_attainment_fall2019_2023_sentence_training_data.csv")

In [18]:
print_counts(train, train["label"])

Counts:
 label
No     2644
Yes     360
Name: count, dtype: int64
Percentages:
 label
No     88.015979
Yes    11.984021
Name: count, dtype: float64


In [19]:
print_counts(test, test["label"])

Counts:
 label
No     662
Yes     90
Name: count, dtype: int64
Percentages:
 label
No     88.031915
Yes    11.968085
Name: count, dtype: float64


In [383]:
print_counts(clean_df, clean_df["label"])

Counts:
 label
No     4199
Yes     231
Name: count, dtype: int64
Percentages:
 label
No     94.785553
Yes     5.214447
Name: count, dtype: float64


## 3.) Prepare sentence-level dataset

In [519]:
# Make a copy of df
clean_df = df.copy()
# UNIQUE TO SENTENCE-LEVEL DATASET: Drop essays not containing the theme
clean_df = clean_df[~clean_df['phrase'].isin([0, '-', 'Unable to code'])]

We need to split the essays and phrases into sentences. `/%/` is an omit operator, we use it to indicate that there is something in the middle of the two sentences, like an ellipsis.

In [13]:
def tokenize_sents(text):
    sentences = re.split('\.|\?|\!',text)
    valid_sentences = [s for s in sentences if len(s.split()) > 3]
    return valid_sentences

In [14]:
def clean_text(text, no_punc=False):
    # Define a regex pattern
    if no_punc:
        pattern = r'[^a-zA-Z0-9]'
    else:
        pattern = r'[^a-zA-Z0-9.,!?;:\'"()\[\]{}]+'
    # Replace "/%/" with a period (".")
    cleaned_text = re.sub(r'/%/', '.', text)
    
    # Keep only alphanumeric characters and/or common punctuation
    cleaned_text = re.sub(pattern, ' ', cleaned_text)
    
    # Remove extra spaces and convert to lowercase
    cleaned_text = re.sub(r'\s+', ' ', cleaned_text).strip().lower()
    return cleaned_text


In [15]:
def clean_list(list_text, no_punc=False):
    clean_list_text = []

    if type(list_text) != list:
        return list_text
    else:
        cleaned_text = ""
        for text in list_text:
            cleaned_text = clean_text(text, no_punc=no_punc)
            clean_list_text.append(cleaned_text)
        
        return clean_list_text

We can actually check how many sentences we should have positive for attainment by splitting the phrases into sentences and counting all the sentences.

In [523]:
# Clean the text, include punctuations
clean_df["clean_essay"] = [clean_text(x) if type(x) == str else x for x in clean_df["essay"]]
clean_df["clean_phrase"] = [clean_text(x) if type(x) == str else x for x in clean_df["phrase"]]

clean_df["split_essay"] = [tokenize_sents(x) if type(x) == str else x for x in clean_df["clean_essay"]]
clean_df["split_phrase"] = [tokenize_sents(x) if type(x) == str else x for x in clean_df["clean_phrase"]]

# Clean the rows, exclude punctuations
clean_df["split_essay"] = [clean_list(row, no_punc=True) for row in clean_df["split_essay"]]
clean_df["split_phrase"] = [clean_list(row, no_punc=True) for row in clean_df["split_phrase"]]

# Convert integer 0 values to empty lists using list comprehension
clean_df['split_essay'] = [x if x != 0 else [] for x in clean_df['split_essay']]
clean_df['split_phrase'] = [x if x != 0 else [] for x in clean_df['split_phrase']]

In [524]:
count_len = 0

for i in clean_df["split_phrase"]:
    if type(i) == list:
        add = len(i)
        count_len = count_len+add

print("Number of sentences in the phrase column (no. of positive sentences):", count_len)

Number of sentences in the phrase column (no. of positive sentences): 233


In [525]:
count_len = 0

for i in clean_df["split_essay"]:
    if type(i) == list:
        add = len(i)
        count_len = count_len+add

print("Total number of sentences in the essay column: ", count_len)

Total number of sentences in the essay column:  1405


In [526]:
clean_df

Unnamed: 0,ID CODE,essay,phrase,clean_essay,clean_phrase,split_essay,split_phrase
0,pilot_study_01,"I am here to get my BS in Physiology, so I can...",I am here to get my BS in physiology so I can ...,"i am here to get my bs in physiology, so i can...",i am here to get my bs in physiology so i can ...,[i am here to get my bs in physiology so i can...,[i am here to get my bs in physiology so i can...
8,pilot_study_09,I’m here to learn and to be on the path toward...,I’m here to learn and to be on the path toward...,i m here to learn and to be on the path toward...,i m here to learn and to be on the path toward...,[i m here to learn and to be on the path towar...,[i m here to learn and to be on the path towar...
9,pilot_study_10,I am in SFSU because I want to go to medical s...,I am in SFSU because I want to go to medical s...,i am in sfsu because i want to go to medical s...,i am in sfsu because i want to go to medical s...,[i am in sfsu because i want to go to medical ...,[i am in sfsu because i want to go to medical ...
11,pilot_study_12,I am here in SFSU because I am the first gener...,My goal is to graduate with a degree B.A./B.S.,i am here in sfsu because i am the first gener...,my goal is to graduate with a degree b.a. b.s.,[i am here in sfsu because i am the first gene...,[my goal is to graduate with a degree b]
13,pilot_study_14,I am here because of my goal in trying to get ...,I am here because of my goal in trying to get ...,i am here because of my goal in trying to get ...,i am here because of my goal in trying to get ...,[i am here because of my goal in trying to get...,[i am here because of my goal in trying to get...
...,...,...,...,...,...,...,...
551,F19.PHYS232.07.23,I'm here because of prerequisite for my comput...,And my dream job is to be a software engineer.,i'm here because of prerequisite for my comput...,and my dream job is to be a software engineer.,[i m here because of prerequisite for my compu...,[and my dream job is to be a software engineer]
559,F19.PHYS232.09.08,The reason I am here is to do experiments and ...,I am an aspiring Engineer and I love doing ex...,the reason i am here is to do experiments and ...,i am an aspiring engineer and i love doing exp...,[the reason i am here is to do experiments and...,[i am an aspiring engineer and i love doing ex...
561,F19.PHYS232.09.10,I am in college to further my education. There...,I am studying to become an electrical engineer.,i am in college to further my education. there...,i am studying to become an electrical engineer.,"[i am in college to further my education, ther...",[i am studying to become an electrical engineer]
562,F19.PHYS242.05.01,I am here at SFSU for a multitude of reasons. ...,My second is to receive a degree in Engineerin...,i am here at sfsu for a multitude of reasons. ...,my second is to receive a degree in engineerin...,"[i am here at sfsu for a multitude of reasons,...",[my second is to receive a degree in engineeri...


In [527]:
# Explode the split_essay column by sentences into rows 
clean_df = clean_df.explode('split_essay', ignore_index=True)

# Check for similarity matches using fuzzy
def check_label(sentence, split_phrase):
    for phrase in split_phrase:
        if fuzz.ratio(sentence, phrase) >= 53:  # threshold 53 gives us total of 231 labels, which matches with 231 sentences that are positive for the theme
            return 'Yes'
    return 'No'

# Apply the check_label function to create the 'label' column
clean_df['label'] = clean_df.apply(lambda row: check_label(row['split_essay'], row['split_phrase']), axis=1)

# Reset the index
clean_df.reset_index(drop=True, inplace=True)


In [528]:
print_counts(clean_df, clean_df["label"])

Counts:
 label
No     1171
Yes     234
Name: count, dtype: int64
Percentages:
 label
No     83.345196
Yes    16.654804
Name: count, dtype: float64


In [529]:
sentence_df = clean_df.copy()

In [530]:
sentence_df

Unnamed: 0,ID CODE,essay,phrase,clean_essay,clean_phrase,split_essay,split_phrase,label
0,pilot_study_01,"I am here to get my BS in Physiology, so I can...",I am here to get my BS in physiology so I can ...,"i am here to get my bs in physiology, so i can...",i am here to get my bs in physiology so i can ...,i am here to get my bs in physiology so i can ...,[i am here to get my bs in physiology so i can...,Yes
1,pilot_study_01,"I am here to get my BS in Physiology, so I can...",I am here to get my BS in physiology so I can ...,"i am here to get my bs in physiology, so i can...",i am here to get my bs in physiology so i can ...,for this class i am trying to make friends con...,[i am here to get my bs in physiology so i can...,No
2,pilot_study_09,I’m here to learn and to be on the path toward...,I’m here to learn and to be on the path toward...,i m here to learn and to be on the path toward...,i m here to learn and to be on the path toward...,i m here to learn and to be on the path toward...,[i m here to learn and to be on the path towar...,Yes
3,pilot_study_09,I’m here to learn and to be on the path toward...,I’m here to learn and to be on the path toward...,i m here to learn and to be on the path toward...,i m here to learn and to be on the path toward...,i m here to be a better student and a better v...,[i m here to learn and to be on the path towar...,No
4,pilot_study_09,I’m here to learn and to be on the path toward...,I’m here to learn and to be on the path toward...,i m here to learn and to be on the path toward...,i m here to learn and to be on the path toward...,i set goals for myself and i m here to achieve...,[i m here to learn and to be on the path towar...,No
...,...,...,...,...,...,...,...,...
1400,F19.PHYS242.05.04,Why am I here? †I am at San Francisco State Un...,I am at San Francisco State University to get ...,why am i here? i am at san francisco state uni...,i am at san francisco state university to get ...,i can possibly minor or double major in econom...,[i am at san francisco state university to get...,No
1401,F19.PHYS242.05.04,Why am I here? †I am at San Francisco State Un...,I am at San Francisco State University to get ...,why am i here? i am at san francisco state uni...,i am at san francisco state university to get ...,why the switch to engineering,[i am at san francisco state university to get...,No
1402,F19.PHYS242.05.04,Why am I here? †I am at San Francisco State Un...,I am at San Francisco State University to get ...,why am i here? i am at san francisco state uni...,i am at san francisco state university to get ...,as a kid i always wanted to go into an enginee...,[i am at san francisco state university to get...,No
1403,F19.PHYS242.05.04,Why am I here? †I am at San Francisco State Un...,I am at San Francisco State University to get ...,why am i here? i am at san francisco state uni...,i am at san francisco state university to get ...,by the time i got here at sfsu i decided to ju...,[i am at san francisco state university to get...,No


In [537]:
sentence_df = sentence_df[["clean_essay", "split_essay", "label"]]
sentence_df.rename(columns={"clean_essay":"key_essay", "split_essay":"sentences"}, inplace=True)

In [538]:
sentence_df

Unnamed: 0,key_essay,sentences,label
0,"i am here to get my bs in physiology, so i can...",i am here to get my bs in physiology so i can ...,Yes
1,"i am here to get my bs in physiology, so i can...",for this class i am trying to make friends con...,No
2,i m here to learn and to be on the path toward...,i m here to learn and to be on the path toward...,Yes
3,i m here to learn and to be on the path toward...,i m here to be a better student and a better v...,No
4,i m here to learn and to be on the path toward...,i set goals for myself and i m here to achieve...,No
...,...,...,...
1400,why am i here? i am at san francisco state uni...,i can possibly minor or double major in econom...,No
1401,why am i here? i am at san francisco state uni...,why the switch to engineering,No
1402,why am i here? i am at san francisco state uni...,as a kid i always wanted to go into an enginee...,No
1403,why am i here? i am at san francisco state uni...,by the time i got here at sfsu i decided to ju...,No


In [1]:
def create_train_test_csvs(df, name, level="", test_size=0.1):
    train, test = train_test_split(df, test_size=test_size, random_state=18, stratify=df['label'])
    train.to_csv(f"./new_data/attainment/attainment_{name}_{level}_training_data.csv", encoding='UTF-8', index=False)
    test.to_csv(f"./new_data/attainment/attainment_{name}_{level}_test_data.csv",  encoding='UTF-8', index=False)
    print(f"Successfully created attainment_{name}_{level}_level train and test sets.")



In [3]:
attainment_fall2019_fall2023_df = pd.read_csv("./new_data/attainment/attainment_sentence_fall2019_2023_dataset.csv")

In [8]:
attainment_fall2019_fall2023_df

Unnamed: 0,key_essay,sentences,label
0,i'm here because of prerequisite for my comput...,so yea i m not even really sure why i m legit ...,No
1,throughout my entire life i have dreamt about ...,but this dream ended up sticking with me for t...,No
2,i am here to better myself and hopefully give ...,also the reason why i am here in san francisco...,No
3,i m here because i need help to pass ochem. i ...,i love science and even though it s hard and t...,No
4,"i often ask myself this question, ""why am i he...",and even though it hasn t been easy it definit...,No
...,...,...,...
3751,i am here in this physics lab class because it...,it can give me a wider understanding of atoms ...,No
3752,i am here because i want to further advance my...,my family continually inspires me to be a bett...,No
3753,the main reason why i am here is because i nee...,and i wanted to be considered a full time stud...,No
3754,i am here because i want to get the education ...,i really want to be able to look back at my li...,No


In [6]:
attainment_fall2019_fall2023_df[attainment_fall2019_fall2023_df["label"] == "No"]

Unnamed: 0,key_essay,sentences,label
0,i'm here because of prerequisite for my comput...,so yea i m not even really sure why i m legit ...,No
1,throughout my entire life i have dreamt about ...,but this dream ended up sticking with me for t...,No
2,i am here to better myself and hopefully give ...,also the reason why i am here in san francisco...,No
3,i m here because i need help to pass ochem. i ...,i love science and even though it s hard and t...,No
4,"i often ask myself this question, ""why am i he...",and even though it hasn t been easy it definit...,No
...,...,...,...
3751,i am here in this physics lab class because it...,it can give me a wider understanding of atoms ...,No
3752,i am here because i want to further advance my...,my family continually inspires me to be a bett...,No
3753,the main reason why i am here is because i nee...,and i wanted to be considered a full time stud...,No
3754,i am here because i want to get the education ...,i really want to be able to look back at my li...,No


In [7]:
attainment_fall2019_fall2023_df[attainment_fall2019_fall2023_df["label"] == "Yes"]

Unnamed: 0,key_essay,sentences,label
5,i am here in physics 222 because it s part of ...,i am here in physics 222 because it s part of ...,Yes
8,i love biology. right after 12th grade i knew ...,i want to have a career as an epidemiologist v...,Yes
17,i am here to actually understand what i am bei...,that means i have a higher chance of getting e...,Yes
27,i am here to obtain a better understanding of ...,in doing so i can get a better grade in my cur...,Yes
29,i am here because i have this feeling i would ...,i am here because i have this feeling i would ...,Yes
...,...,...,...
3718,the reason i am here today is to strengthen my...,class with a b or higher but we will see about...,Yes
3738,"i am here to get a degree, network, and build ...",i am here to get a degree network and build sk...,Yes
3739,why am i here? i'm here cause my parents broug...,after graduating i d like to go to dental scho...,Yes
3743,i am here in this physics lab class because it...,i feel like knowing all this information can b...,Yes


In [10]:
create_train_test_csvs(attainment_fall2019_fall2023_df, "attainment_fall2019_2023", level="sentence", test_size=0.2)

Successfully created attainment_attainment_fall2019_2023_sentence_level train and test sets.


In [543]:
essay_df.drop(columns="ID CODE", inplace=True)


KeyError: "['ID CODE'] not found in axis"

In [501]:
sentence_df = sentence_df.rename(columns={"split_essay":"sentences"}, inplace=True)


AttributeError: 'NoneType' object has no attribute 'rename'

In [544]:
create_train_test_csvs(essay_df, "fall_2019", "essay")
create_train_test_csvs(sentence_df, "fall_2019", "sentence")

Successfully created attainment_fall_2019_essay_level train and test sets.
Successfully created attainment_fall_2019_sentence_level train and test sets.


## Checking train and test files

### Essay-level

In [468]:
essay_train = pd.read_csv("./new_data/attainment/attainment_essay_training_data.csv")

In [469]:
essay_train.head()

Unnamed: 0,ID CODE,essay,phrase,label
0,F18.SCI111.03.03,The reason why I am here is because I want to ...,0,No
1,F19.PHYS232.07.13,I needed to take this course since my major re...,0,No
2,F19.PHYS122.08.12,The reason why I am here is because my major r...,0,No
3,S19.PHYS112.06.04,Why am I here? I am taking classes at San Fran...,I eventually decided to quit this job to full-...,Yes
4,F19.PHYS112.08.09,"""Why am I here?""Such an abstract question; why...",0,No


In [470]:
essay_train.shape

(513, 4)

In [471]:
essay_test = pd.read_csv("./new_data/attainment/attainment_essay_test_data.csv")
essay_test.shape

(57, 4)

In [472]:
print("Essay-level training set stats:")
print_counts(essay_train, essay_train["label"])

Essay-level training set stats:
Counts:
 label
No     341
Yes    172
Name: count, dtype: int64
Percentages:
 label
No     66.471735
Yes    33.528265
Name: count, dtype: float64


In [466]:
print("Essay-level test set stats:")
print_counts(essay_test, essay_test["label"])

Essay-level test set stats:
Counts:
 label
No     19
Yes    10
Name: count, dtype: int64
Percentages:
 label
No     65.517241
Yes    34.482759
Name: count, dtype: float64


### Sentence-level

In [452]:
sentence_train = pd.read_csv("./new_data/attainment/attainment_sentence_training_data.csv")

In [453]:
sentence_train.head()

Unnamed: 0,clean_essay,split_essay,label
0,i am in physics because i need to take the cla...,at the same time physics makes sense because i...,No
1,i am in this physics class because it is requi...,i expect my professors in both lab and lecture...,No
2,i know that i was put on this earth to be do s...,i am not sure why college is so challenging fo...,No
3,why am i here? i am here to work on my civil e...,i am here to work on my civil engineering degree,Yes
4,why am i here?i am here in this astronomy lab ...,i have also been passionate about these two th...,No


In [454]:
sentence_train.shape

(1264, 3)

In [455]:
sentence_test = pd.read_csv("./new_data/attainment/attainment_sentence_test_data.csv")
sentence_test.shape

(141, 3)

In [456]:
print("Sentence-level training set stats:")
print_counts(sentence_train, sentence_train["label"])

Sentence-level training set stats:
Counts:
 label
No     1053
Yes     211
Name: count, dtype: int64
Percentages:
 label
No     83.306962
Yes    16.693038
Name: count, dtype: float64


In [457]:
print("Sentence-level test set stats:")
print_counts(sentence_test, sentence_test["label"])

Sentence-level test set stats:
Counts:
 label
No     118
Yes     23
Name: count, dtype: int64
Percentages:
 label
No     83.687943
Yes    16.312057
Name: count, dtype: float64


Now we want to do the same for the new dataset `Attainment_Capitals_TACCTI_HumanValidated_Fall2023.xlsm`

## Creating a function



In [4]:
fall_2023_df = pd.read_excel("./new_data/attainment/Attainment_Capitals_TACCTI_HumanValidated_Fall2023.xlsm")

In [5]:
fall_2023_df.head()

Unnamed: 0,essay,label,phrase
0,Reflection #1: Why Am I Here? “Why am I here...,Yes,"Particularly, it was all the injuries that I h..."
1,I am here because my future goal is to heal pe...,Yes,"I would like to become a physical therapist, ..."
2,Reflection: At the moment I am feeling very st...,No,0
3,"Holly Enrile PHYSICS LAB Reﬂection#1 Sept. 10,...",No,0
4,Karla Martinez Why I am here? One of the mai...,Yes,I am also here because I want to be a nurse


In [6]:
fall_2023_df

Unnamed: 0,essay,label,phrase
0,Reflection #1: Why Am I Here? “Why am I here...,Yes,"Particularly, it was all the injuries that I h..."
1,I am here because my future goal is to heal pe...,Yes,"I would like to become a physical therapist, ..."
2,Reflection: At the moment I am feeling very st...,No,0
3,"Holly Enrile PHYSICS LAB Reﬂection#1 Sept. 10,...",No,0
4,Karla Martinez Why I am here? One of the mai...,Yes,I am also here because I want to be a nurse
...,...,...,...
730,I am in this sci course because sci classes ha...,No,0
731,I’m here to get a better grasp on this subject...,No,0
732,I’m here to learn and expand my knowledge so I...,No,0
733,I am here to understand the material taught in...,No,0


In [480]:
if "phrase" in fall_2023_df.columns:
    print("yes")

yes


In [17]:
def prepare_essay_level_dataset(path_to_df, name):

    df = pd.read_excel(path_to_df)
    ## assumes dataframe is read and has at least an essay and phrase columns
    if "label" not in df.columns:
        # create a label column
        df["label"] = ["Yes" if text != 0 else "No" for text in essay_df["phrase"]]
    print_counts(df, df["label"])
    create_train_test_csvs(df, name, "essay")

In [18]:
prepare_essay_level_dataset("./new_data/attainment/Attainment_Capitals_TACCTI_HumanValidated_Fall2023.xlsm", "fall_2023")

Counts:
 label
No     443
Yes    292
Name: count, dtype: int64
Percentages:
 label
No     60.272109
Yes    39.727891
Name: count, dtype: float64
Successfully created attainment_fall_2023_essay_level train and test sets.


In [43]:
def prepare_sentence_level_dataset(path_to_df, name, fuzz_threhsold=80):
    df = pd.read_excel(path_to_df)
    ## assumes dataframe is read and has at least an essay and phrase columns
    df = df[~df['phrase'].isin([0, '-', 'Unable to code'])]

    # Clean the text, include punctuations
    df["clean_essay"] = [clean_text(x) if type(x) == str else x for x in df["essay"]]
    df["clean_phrase"] = [clean_text(x) if type(x) == str else x for x in df["phrase"]]

    df["split_essay"] = [tokenize_sents(x) if type(x) == str else x for x in df["clean_essay"]]
    df["split_phrase"] = [tokenize_sents(x) if type(x) == str else x for x in df["clean_phrase"]]

    # Clean the rows, exclude punctuations
    df["split_essay"] = [clean_list(row, no_punc=True) for row in df["split_essay"]]
    df["split_phrase"] = [clean_list(row, no_punc=True) for row in df["split_phrase"]]

    # Convert integer 0 values to empty lists using list comprehension
    df['split_essay'] = [x if x != 0 else [] for x in df['split_essay']]
    df['split_phrase'] = [x if x != 0 else [] for x in df['split_phrase']]

    count_len = 0

    for i in df["split_phrase"]:
        if type(i) == list:
            add = len(i)
            count_len = count_len+add

    print("Number of sentences in the phrase column (no. of positive sentences):", count_len)

    count_len = 0

    for i in df["split_essay"]:
        if type(i) == list:
            add = len(i)
            count_len = count_len+add

    print("Total number of sentences in the essay column: ", count_len)

    # Explode the split_essay column by sentences into rows 
    df = df.explode('split_essay', ignore_index=True)

    # Check for similarity matches using fuzzy
    def check_label(sentence, split_phrase):
        for phrase in split_phrase:
            if fuzz.ratio(sentence, phrase) >= fuzz_threhsold:  # threshold 
                return 'Yes'
        return 'No'

    # Apply the check_label function to create the 'label' column
    df['label'] = df.apply(lambda row: check_label(row['split_essay'], row['split_phrase']), axis=1)

    # Reset the index
    df.reset_index(drop=True, inplace=True)

    sentence_df = df[["clean_essay", "split_essay", "label"]]
    sentence_df.rename(columns={"clean_essay":"key_essay", "split_essay":"sentences"}, inplace=True)
    
    print_counts(sentence_df, sentence_df["label"])
    # create_train_test_csvs(sentence_df, name, "sentence")


In [512]:
fall_2023_df

Unnamed: 0,essay,label,phrase
0,Reflection #1: Why Am I Here? “Why am I here...,Yes,"Particularly, it was all the injuries that I h..."
1,I am here because my future goal is to heal pe...,Yes,"I would like to become a physical therapist, ..."
2,Reflection: At the moment I am feeling very st...,No,0
3,"Holly Enrile PHYSICS LAB Reﬂection#1 Sept. 10,...",No,0
4,Karla Martinez Why I am here? One of the mai...,Yes,I am also here because I want to be a nurse
...,...,...,...
730,I am in this sci course because sci classes ha...,No,0
731,I’m here to get a better grasp on this subject...,No,0
732,I’m here to learn and expand my knowledge so I...,No,0
733,I am here to understand the material taught in...,No,0


In [535]:
prepare_sentence_level_dataset("./new_data/attainment/Attainment_Capitals_TACCTI_HumanValidated_Fall2023.xlsm", "fall_2023", 100) #100 threshold because we copy-pasted sentences word for word.

Number of sentences in the phrase column (no. of positive sentences): 311
Total number of sentences in the essay column:  2351
Counts:
 label
No     2135
Yes     216
Name: count, dtype: int64
Percentages:
 label
No     90.81242
Yes     9.18758
Name: count, dtype: float64
Successfully created attainment_fall_2023_sentence_level train and test sets.


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sentence_df.rename(columns={"clean_essay":"key_essay", "split_essay":"sentences"}, inplace=True)


In [536]:
pd.read_csv("./new_data/attainment/attainment_fall_2023_sentence_training_data.csv")

Unnamed: 0,key_essay,sentences,label
0,"i am at san francisco state university, becaus...",i am at san francisco state university because...,No
1,i am here because i know i struggle in chemist...,my major is microbiology the clinical lab scie...,No
2,i m in college because i know that i can make ...,i m in love with research and the beautiful ne...,No
3,"why am i here? well, i have to take this class...",physics has always come easier to me and scien...,No
4,why am i here? why am i here? i am here to be ...,why am i here,No
...,...,...,...
2110,i am here in this physics lab class because it...,it can give me a wider understanding of atoms ...,No
2111,i am here because i want to further advance my...,my family continually inspires me to be a bett...,No
2112,the main reason why i am here is because i nee...,and i wanted to be considered a full time stud...,No
2113,i am here because i want to get the education ...,i really want to be able to look back at my li...,No


In [541]:
pd.read_csv("./new_data/attainment/attainment_fall_2019_sentence_training_data.csv")

Unnamed: 0,key_essay,sentences,label
0,i am in physics because i need to take the cla...,at the same time physics makes sense because i...,No
1,i am in this physics class because it is requi...,i expect my professors in both lab and lecture...,No
2,i know that i was put on this earth to be do s...,i am not sure why college is so challenging fo...,No
3,why am i here? i am here to work on my civil e...,i am here to work on my civil engineering degree,Yes
4,why am i here?i am here in this astronomy lab ...,i have also been passionate about these two th...,No
...,...,...,...
1259,i'm here because of prerequisite for my comput...,and my dream job is to be a software engineer,Yes
1260,why am i here?i am here in this astronomy lab ...,this subject is intriguing but i am pursuing a...,Yes
1261,i am taking sci 111 because it is a lower divi...,i am taking sci 111 because it is a lower divi...,Yes
1262,i am here to obtain a better understanding of ...,mostly i am in this sci class so that i can ge...,No


In [545]:
pd.read_csv("./new_data/attainment/attainment_fall_2019_essay_training_data.csv")

Unnamed: 0,essay,phrase,label
0,The reason why I am here is because I want to ...,0,No
1,I needed to take this course since my major re...,0,No
2,The reason why I am here is because my major r...,0,No
3,Why am I here? I am taking classes at San Fran...,I eventually decided to quit this job to full-...,Yes
4,"""Why am I here?""Such an abstract question; why...",0,No
...,...,...,...
508,When asked the question of why I am here I thi...,I found my calling in sciences and am now purs...,Yes
509,I am here because this is the only lab class t...,0,No
510,Hello everyone!I'm not only taking this course...,0,No
511,"The reason I am here at SF state was, at first...",0,No


In [547]:
pd.read_csv("./new_data/attainment/attainment_fall_2023_essay_test_data.csv")

Unnamed: 0,essay,label,phrase
0,I’m here because I want to learn more about th...,No,0
1,John Ellis Jr2/18/2020I am here in this SCI cl...,No,0
2,To be completely honest I am taking this cours...,No,0
3,I am here in this lab class because I needed t...,No,0
4,Emily Pham 09/10/20 Physics 102 Reflec...,No,0
...,...,...,...
69,I am here because I want to further my underst...,Yes,I am here because I am pursuing a further ed...
70,The reason why I am here is to seek help with ...,No,0
71,I am here in this physics lab class because it...,Yes,I feel like knowing all this information can ...
72,This isn't my first so class and from the past...,No,0


In [32]:
fall_2023_df

Unnamed: 0,essay,label,phrase
0,Reflection #1: Why Am I Here? “Why am I here...,Yes,"Particularly, it was all the injuries that I h..."
1,I am here because my future goal is to heal pe...,Yes,"I would like to become a physical therapist, ..."
2,Reflection: At the moment I am feeling very st...,No,0
3,"Holly Enrile PHYSICS LAB Reﬂection#1 Sept. 10,...",No,0
4,Karla Martinez Why I am here? One of the mai...,Yes,I am also here because I want to be a nurse
...,...,...,...
730,I am in this sci course because sci classes ha...,No,0
731,I’m here to get a better grasp on this subject...,No,0
732,I’m here to learn and expand my knowledge so I...,No,0
733,I am here to understand the material taught in...,No,0


In [33]:
orig_attainment_df = essay_df.copy()
orig_attainment_df.drop(columns="ID CODE", inplace=True)

In [40]:
all_attainment_data = pd.concat([fall_2023_df, orig_attainment_df]).reset_index(drop=True)

In [41]:
all_attainment_data

Unnamed: 0,essay,label,phrase
0,Reflection #1: Why Am I Here? “Why am I here...,Yes,"Particularly, it was all the injuries that I h..."
1,I am here because my future goal is to heal pe...,Yes,"I would like to become a physical therapist, ..."
2,Reflection: At the moment I am feeling very st...,No,0
3,"Holly Enrile PHYSICS LAB Reﬂection#1 Sept. 10,...",No,0
4,Karla Martinez Why I am here? One of the mai...,Yes,I am also here because I want to be a nurse
...,...,...,...
1300,Why am I here? †I am at San Francisco State Un...,Yes,I am at San Francisco State University to get ...
1301,"Why am I here? Honestly, I think I'm here by s...",No,0
1302,One of the main reasons why I came to Californ...,No,0
1303,When I was a junior and senior in high school ...,No,0


In [42]:
print_counts(all_attainment_data, all_attainment_data["label"])

Counts:
 label
No     822
Yes    483
Name: count, dtype: int64
Percentages:
 label
No     62.988506
Yes    37.011494
Name: count, dtype: float64


In [49]:
all_attainment_data.to_excel("unclean_all_attainment.xlsx")

In [52]:
prepare_sentence_level_dataset("./unclean_all_attainment.xlsx", "sentence_level_all_attainment_data")

Number of sentences in the phrase column (no. of positive sentences): 544
Total number of sentences in the essay column:  3756
Counts:
 label
No     3308
Yes     448
Name: count, dtype: int64
Percentages:
 label
No     88.072417
Yes    11.927583
Name: count, dtype: float64


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  sentence_df.rename(columns={"clean_essay":"key_essay", "split_essay":"sentences"}, inplace=True)


In [56]:
a_df = pd.read_csv("./new_data/attainment/attainment_sentence_fall2019_2023_dataset.csv")

In [57]:
print_counts(a_df, a_df["label"])

Counts:
 label
No     3306
Yes     450
Name: count, dtype: int64
Percentages:
 label
No     88.019169
Yes    11.980831
Name: count, dtype: float64
