## Code for uploading files to an S3 bucket

In [56]:
import pandas as pd
from tqdm import tqdm
# ! pip install pandarallel
import re, string
from bs4 import BeautifulSoup
import spacy
import nltk
from nltk.corpus import stopwords
# pip install lxml
import boto3
from tqdm import tqdm
# !python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")

In [57]:
# Resource-level S3 handle, used here to create and look up the project bucket.
s3 = boto3.resource('s3')
BUCKET_NAME = 'ethan-kozlowski-project'

# NOTE(review): create_bucket is called without a CreateBucketConfiguration;
# presumably the configured default region is us-east-1 — confirm, since other
# regions need an explicit LocationConstraint.
s3.create_bucket(Bucket=BUCKET_NAME)
bucket = s3.Bucket(BUCKET_NAME)

In [58]:
def upload_to_s3(file_name:str, bucket:str, object_name:str=None, s3=None):
    """
    Upload a local file to an S3 bucket.

    Args:
        file_name: Path of the local file to upload.
        bucket: Name of the destination bucket.
        object_name: Key to store the file under; defaults to ``file_name``.
        s3: Client object exposing ``upload_file(file_name, bucket, object_name)``.
            When omitted, a ``boto3.client('s3')`` is created at call time.

    Returns:
        bool: True if the upload call completed, False if it raised (the
        exception is printed rather than propagated).
    """
    # Create the client lazily: a default written in the signature is evaluated
    # once, when the function is defined, and then shared by every call.
    if s3 is None:
        s3 = boto3.client('s3')

    # if no object name is given, just make it the same as the file name
    if object_name is None:
        object_name = file_name

    try:
        s3.upload_file(file_name, bucket, object_name)
    except Exception as e:
        # Best-effort by design: report the failure and signal it via the return value.
        print(e)
        return False
    return True

In [59]:
import pandas as pd
import re

In [60]:
# Absolute, machine-specific paths to the two input CSVs that are merged below.
old_data = r"D:\hw\adopt_proj\DATA_DO_NOT_REMOVE\all_posts_pre_cleaned.csv"
newer_posts = r"D:\hw\adopt_proj\DATA_DO_NOT_REMOVE\CRAWLEDnewest_posts_5-9.csv"

In [61]:
# Load both CSVs into DataFrames using pandas' default parsing options.
old_data_df = pd.read_csv(old_data)
newer_posts_df = pd.read_csv(newer_posts)

In [62]:
# Stack both inputs, keep the first row seen for each id, renumber the index,
# drop the two unused columns, then normalise the time/title/text columns.
full_df = (
    pd.concat([old_data_df, newer_posts_df])
    .drop_duplicates(subset="id")
    .reset_index(drop=True)
    .drop(columns=['url', 'upvote_ratio'])
    .assign(
        # "time" holds epoch seconds; convert to pandas datetimes.
        time=lambda frame: pd.to_datetime(frame["time"], unit='s'),
        # Missing titles/bodies become empty strings so string ops are safe.
        title=lambda frame: frame["title"].fillna(""),
        text=lambda frame: frame["text"].fillna(""),
    )
)

In [63]:
def format_full_text(title, text):
    """
    Combine a title and a body into one lowercase string.

    An empty title yields just the body. Otherwise the title is followed by the
    body, with a period inserted after the title unless it already ends in
    '.', '!' or '?'. Surrounding whitespace is stripped from the result.
    """
    if title:
        # Only add a sentence break when the title does not already end one.
        separator = " " if title.endswith(('.', '!', '?')) else ". "
        combined = f"{title}{separator}{text}"
    else:
        combined = text

    return combined.strip().lower()

In [64]:
# Build the combined, lowercased title+body string for every row (row-wise apply).
full_df["full_text"] = full_df.apply(lambda row: format_full_text(row.title, row.text), axis=1)

In [65]:
# Map of typographic / accented characters to plain-ASCII replacements.
# Keys are used as regex patterns below (regex=True), so each occurrence inside
# a string is replaced rather than only whole-cell matches.
replace_dict = {
    '’': "'",
    '‘': "'",
    '“': '"',
    '”': '"',
    "•": "",
    "…": " ",
    "&": "and",
    "è": "e",
    "é": "e",
    "ê": "e",
    "à": "a",
    "â": "a",
    "ô": "o",
    "û": "u",
    "ç": "c",
    "î": "i",
    "ï": "i",
    "ù": "u",
    "ü": "u",
    "°": "degree"
}

# replace these problematic characters especially for contractions used later
# NOTE(review): "&" -> "and" runs before clean_text parses HTML, so entities such
# as "&nbsp;" presumably turn into "andnbsp;" — confirm whether that is intended.
full_df["full_text"] = full_df.full_text.replace(replace_dict, regex=True)

In [66]:
# change the ID tags by removing the 't\d_' prefix
# Rows whose parent_id / link_id is missing fall back to the row's own id.
full_df.parent_id = full_df.apply(lambda row: re.sub(r't\d_', "", row.parent_id) if pd.notna(row.parent_id) else row.id, axis=1)
full_df.link_id = full_df.apply(lambda row: re.sub(r't\d_', "", row.link_id) if pd.notna(row.link_id) else row.id, axis=1)

In [67]:
# List the distinct author names ending in "bot"; rows with a missing author
# are treated as non-matching (na=False).
full_df[full_df.author.str.contains(r"bot$", na=False)].author.unique()

array(['exclaim_bot', 'molliebot', 'Booty_Warrior_bot', 'could-of-bot',
       'sneakpeekbot', 'receptionist_robot', 'converter-bot',
       'of_patrol_bot', 'tiny_smile_bot', 'hotlinehelpbot', 'haikusbot',
       'serendipitybot', 'haiku-testbot', 'useles-converter-bot',
       'nicolesarobot', 'ectbot', 'overthinkingrobot', 'battbot',
       'totes_meta_bot', 'autowikibot', 'video_descriptionbot',
       '007Fembot', 'icarebot', 'linebreaker-bot', 'conversionbot',
       'wikipedia_answer_bot', 'cheer_up_bot', 'octopus_tigerbot',
       'JordanTheBrobot', 'EncouragementRobot', 'Lesbianadoptibot',
       'mortalitybot', 'image_linker_bot', 'HIPPAbot', 'IrishReplybot',
       'imdad_bot', 'youtubefactsbot', 'alternate-source-bot',
       'outline_link_bot', 'hipaa-bot', 'I_am_a_haiku_bot',
       'yourewelcome_bot', 'yourewelcome_botbot', 'these_days_bot',
       'nice___bot', 'resavr_bot', 'metric_robot', 'Chuck_Norris_Jokebot'],
      dtype=object)

In [68]:
# potential bots 

# Hand-picked accounts to exclude from the dataset.
problem_users = ["I_am_a_haiku_bot", 
                 "happy-cake-day-bot-", 
                 "Chuck_Norris_Jokebot",
                 "yourewelcome_bot",
                 "yourewelcome_botbot",
                 "nice___bot",
                 "Booty_Warrior_bot",
                 "useles-converter-bot",
                 "IrishReplybot",
                 "hipaa-bot",
                 "metric_robot",
                 "autowikibot",
                 "resavr_bot",
                 "ectbot",
                 "exclaim_bot",
                 "EncouragementRobot",
                 "alternate-source-bot",
                 "could-of-bot",
                 "sneakpeekbot",
                 "youtubefactsbot",
                 "video_descriptionbot",
                 "conversionbot",
                 "serendipitybot",
                 "Squirrelslayer777", # this user is a troll
                 "wikipedia_answer_bot",
                 "tiny_smile_bot",
                 "hotlinehelpbot",
                 "outline_link_bot",
                 "totes_meta_bot",
                 "icarebot",
                 "linebreaker-bot",
                 "JordanTheBrobot",
                 "mortalitybot",
                 "image_linker_bot",
                 "imdad_bot"]

# Headerless CSV: column 0 holds previously recorded bot account names.
bots = pd.read_csv(r"D:\hw\adopt_proj\DATA_DO_NOT_REMOVE\bots.csv", header=None)
# Union of the stored list and the hand-picked accounts above.
known_bots = set(bots[0]) | set(problem_users)

In [69]:
# Write the merged bot list to 'bots.csv' in the current working directory
# (not the DATA_DO_NOT_REMOVE path it was read from). quoting=1 is
# csv.QUOTE_ALL; row order follows set iteration order.
pd.Series(list(known_bots)).to_csv('bots.csv', index=False, header=False, 
                                   encoding='utf-8', escapechar='\\', quotechar='"', quoting=1)

In [70]:
# remove bots
# .copy() makes the filtered frame independent of the unfiltered one, so the
# column assignments made in later cells do not write to a slice
# (avoids SettingWithCopyWarning / ambiguous writes).
full_df = full_df[~full_df.author.isin(known_bots)].copy()

In [106]:
# Multi-word (or variant-spelled) terms mapped to a single canonical token.
# Keys are matched as plain substrings, so plural forms are usually handled by
# the singular key plus the leftover "s" (e.g. "birth mothers" -> "bmoms").
word_equivalents = {
    "adoptive mom": "amom",
    "amother": "amom",
    "adoptive sister": "asis",
    "adoptive sis": "asis",
    "adoptive brother": "abro",
    "adoptive bro" : "abro",
    "adoptive mother": "amom",
    "adoptive dad": "adad",
    "afather": "adad",
    "adoptive father": "adad",
    "adoptive parent": "ap",
    "adoptive parents": "aps",
    "bio mom": "bmom",
    "bmom": "bmom",
    "bio mother": "bmom",
    "biomother" :"bmom",
    "biomom": "bmom",
    "birthmom": "bmom",
    "birth mom": "bmom",
    "birth mother": "bmom",
    "biological mom": "bmom",
    "biological mother": "bmom",
    "biological dad": "bdad",
    "biological father": "bdad",
    "bio dad": "bdad",
    "bdad": "bdad",
    "biodad": "bdad",
    "biofather": "bdad",
    "bio father": "bdad",
    "birth dad": "bdad",
    "birth father": "bdad",
    "birth parent": "bp",
    "biological parent": "bp",
    "bio parent": "bp",
    "bio parents": "bps",
    "birth parents": "bps",
    "biological parents": "bps",
    "birth sister": "bsis",
    "bio sis": "bsis",
    "bio sister": "bsis",
    "birth brother": "bbro",
    "bio bro": "bbro",
    "bio brother": "bbro",
    "birth sibling": "bsibling",
    "bio sibling": "bsibling",
    "bio sib": "bsibling",
    "bio family": "birthfamily",
    "birth family": "birthfamily",
    "biological family": "birthfamily",
    "adoptive family": "adoptive_family",
    "first mom": "first_mom",
    "first mother": "first_mom",
    "first dad": "first_dad",
    "first father": "first_dad",
    "first family": "first_family",
    "first fam": "first_family",
    "step mom": "stepmom",
    "stepmother": "stepmom",
    "stepfather": "stepdad",
    "step mother": "stepmom",
    "step dad": "stepdad",
    "step father": "stepdad",
    "step parent": "stepparent",
    "step parents": "stepparents",
    "step sister": "stepsister",
    "step brother": "stepbrother",
    "step sibling": "stepsibling",
    "stepfamily": "step_family",
    "stepfamilies": "step_families",
    "step families": "step_families",
    "step family": "step_family",
    "step sis": "stepsister",
    "step bro": "stepbrother",
    "step son": "stepson",
    "step daughter": "stepdaughter",
    "adoptive son" : "adoptive_son",
    "adoptive daughter": "adoptive_daughter",
    "adoptive child": "adoptive_child",
    "foster mom": "fostermom",
    "foster parent": "fosterparent",
    "foster parents": "fosterparents",
    "fostermother": "fostermom",
    "foster mother": "fostermom",
    "foster family": "foster_family",
    "foster families" : "foster_families",
    "foster dad": "fosterdad",
    "foster father": "fosterdad",
    "fosterfather": "fosterdad",
    "transracial adoptee": "tra",
    "half sister": "halfsister",
    "half brother": "halfbrother",
    "half sibling": "halfsibling",
    "half siblings": "halfsiblings",
    "half sis": "halfsister",
    "half bro": "halfbrother",
    "birth certificate" : "birth_certificate",
    "et cetera": "etc",
    "home town": "hometown",
    "home country": "homecountry",
    "birth place": "birthplace",
    "birth country": "birthcountry",
    "prospective adoptive parent" : "pap",
    "father-in-law": "father_in_law",
    "mother-in-law": "mother_in_law",
    "brother-in-law": "brother_in_law",
    "sister-in-law": "sister_in_law",
    "mum-in-law": "mother_in_law",
    "father in law": "father_in_law",
    "brother in law": "brother_in_law",
    "mother in law": "mother_in_law",
    "sister in law": "sister_in_law",
    "mum in law": "mother_in_law",
    "birthmum": "bmom",
    "birth mum": "bmom",
    "bio mum": "bmom",
    "biological mum": "bmom",
    "biomum": "bmom",
    "bmomma": "bmom",
    "son in law": "son_in_law",
    "daughter in law": "daughter_in_law",
    "ad ptee": "adoptee",
    "intercountry adoptee": "ica",
    "intercountry adoption": "ica",
    "united states": "usa",
    "united kingdom": "uk",
    "new york": "ny",
    "los angeles": "la",
    "former foster youth": "ffy",
    "puerto ric": "puerto_ric",
    "child protective services": "cps",
    "foster care": "fostercare",
    "south korea": "sk",
    "hong kong": "hk",
    "fiancee": "fiance",
    "fiancees": "fiances",
    "girl friend": "girlfriend",
    "boy friend": "boyfriend",
    "tl dr": "tldr",
    "tl:dr": "tldr",
    "family tree dna": "ftdna", # an ancestry dna website
    "bipolar depression": "bpd",
    "bi polar depression": "bpd",
    "pod cast": "podcast",
    "post traumatic stress disorder": "ptsd",
    "up vote": "upvote",
    "down vote": "downvote",
}

# Regex alternation takes the FIRST alternative that matches at a position, not
# the longest one. Sorting keys longest-first stops a short key from shadowing a
# longer key that starts with it (e.g. "bio sis" turning "bio sister" into
# "bsister", or "bmom" preventing "bmomma" from ever matching).
word_equiv_pattern = re.compile(
    '|'.join(re.escape(key) for key in sorted(word_equivalents, key=len, reverse=True))
)

In [108]:
def decontracted(phrase):
    """
    Expand English contractions in ``phrase`` and return the new string.

    The two irregular forms ("won't", "can't") are handled first; the generic
    suffix rules then run in the listed order, so "n't" is expanded before the
    bare "'t" rule sees the text.
    """
    expansions = (
        # specific
        (r"\bwon't\b", "will not"),
        (r"\bcan\'t\b", "can not"),
        # general
        (r"n\'t\b", " not"),
        (r"\'re\b", " are"),
        (r"\'s\b", " is"),
        (r"\'d\b", " would"),
        (r"\'ll\b", " will"),
        (r"\'t\b", " not"),
        (r"\'ve\b", " have"),
        (r"\'m\b", " am"),
    )
    for pattern, expansion in expansions:
        phrase = re.sub(pattern, expansion, phrase)

    return phrase


def remove_excess_punctuation(text):
    """
    Tidy sentence punctuation (. ! ?) and return the stripped result.

    Applied in order:
      1. a run of two or more marks (optionally separated by whitespace)
         collapses to its first mark;
      2. a space is inserted after any mark glued to a following character;
      3. whitespace around a mark at the very end of the text is trimmed;
      4. a lone '!' or '?' surrounded by whitespace is replaced by one space.
    """
    steps = (
        (r'([.!?])\s*([.!?]\s*)+', r'\1'),
        (r'([.!?])([^\s])', r'\1 \2'),
        (r'\s*([.!?])\s*$', r'\1'),
        (r'\s+([!?])\s+', ' '),
    )
    cleaned = text
    for pattern, replacement in steps:
        cleaned = re.sub(pattern, replacement, cleaned)

    return cleaned.strip()


def clean_text(text):
    """
    Normalize one post/comment string for downstream tokenization.

    Steps, in order: expand contractions, strip HTML markup, drop URLs and
    e-mail addresses, replace every run of characters other than a-z . ! ?
    with a space, tidy punctuation runs, map multi-word terms to single tokens
    via ``word_equivalents``, rewrite subreddit / user mentions as ``r_name`` /
    ``u_name``, and collapse whitespace.

    Args:
        text (str): Text to clean. It should already be lowercase, because the
            character filter only keeps a-z.

    Returns:
        str: The cleaned text without leading/trailing whitespace.
    """
    # expand the contracted words
    post_text = decontracted(text)

    # remove html tags
    post_text = BeautifulSoup(post_text, 'lxml').get_text().strip()

    # remove url
    post_text = re.sub(r'https?://\S+|www\.\S+', '', post_text)
    # remove emails anywhere in the text. The pattern is deliberately NOT
    # anchored with ^...$: anchoring only removed an address when it was the
    # entire text. Each dot in the domain must be followed by a label, so a
    # sentence-ending period after the address is left in place.
    post_text = re.sub(r"[a-zA-Z0-9_.+-]+@[a-zA-Z0-9-]+(?:\.[a-zA-Z0-9-]+)+", '', post_text)

    # remove special characters (everything except a-z and . ! ?)
    post_text = re.sub(r'[^a-z\!\?\.]+', ' ', post_text)
    # remove ellipses and artifacts of empty sentences
    post_text = remove_excess_punctuation(post_text)
    # collapse known multi-word terms into their single-token equivalents
    post_text = word_equiv_pattern.sub(lambda m: word_equivalents[m.group(0)], post_text)

    # replace subreddit mentions
    # NOTE(review): "/" has already been replaced by a space at this point, so
    # only the "r " / "u " alternatives can match; a standalone "u" (as in
    # "do u know") is therefore also rewritten to "u_know" — confirm intent.
    reddit_pattern = r"\b(\/r\/|r\/|r )([a-zA-Z0-9][a-zA-Z0-9_]{0,20})\b"
    post_text = re.sub(reddit_pattern, lambda m: 'r_' + m.group(2), post_text)
    # replace user name mentions
    reddit_user_name_pattern = r"\b(\/u\/|u\/|u )([a-zA-Z0-9_-]{2,20})\b"
    post_text = re.sub(reddit_user_name_pattern, lambda m: 'u_' + m.group(2), post_text)
    # remove extra white space
    post_text = re.sub(r"\s+", " ", post_text)

    return post_text.strip()

In [109]:
# Register tqdm's progress_apply on pandas objects, then clean every row's text.
tqdm.pandas()
full_df["full_text"] = full_df.full_text.progress_apply(clean_text)

  post_text = BeautifulSoup(post_text, 'lxml').get_text().strip() # re.sub(r'<.*?>', '', text)
  post_text = BeautifulSoup(post_text, 'lxml').get_text().strip() # re.sub(r'<.*?>', '', text)
100%|██████████| 299893/299893 [01:53<00:00, 2639.39it/s]


In [110]:
# Keep only rows with a known author. .copy() makes the result an independent
# frame so the new columns assigned in later cells are not written to a slice
# of the previous frame (avoids SettingWithCopyWarning).
full_df = full_df[full_df.author.notna()].copy()

In [111]:
sorted_flairs = pd.read_csv(r"D:\hw\adopt_proj\DATA_DO_NOT_REMOVE\all_sorted_flairs.csv")
# Build one case-insensitive alternation per class from the flair strings in
# the adoptee_flair / non_adoptee_flair columns (missing cells are skipped and
# each flair is escaped so it is matched literally).
adoptee_labels = "|".join(map(re.escape, sorted_flairs["adoptee_flair"].dropna()))
non_adoptee_labels = "|".join(map(re.escape, sorted_flairs["non_adoptee_flair"].dropna()))
adoptee_pattern = re.compile(adoptee_labels, flags=re.IGNORECASE)
non_adoptee_pattern = re.compile(non_adoptee_labels, flags=re.IGNORECASE)

def categorize_user(x):
    """
    Label a flair string: 1 = adoptee, 0 = non-adoptee, -1 = not enough info.

    The adoptee pattern is tried first. Both patterns are applied with
    ``match``, i.e. they must match starting at the first character of ``x``.
    """
    for pattern, label in ((adoptee_pattern, 1), (non_adoptee_pattern, 0)):
        if pattern.match(x):
            return label
    return -1  # NEI

In [112]:
# Classify each author flair; astype(str) turns missing flairs into the string
# "nan" so categorize_user always receives a str.
full_df.loc[:,"is_adoptee"] = full_df.author_flair.astype(str).apply(categorize_user).copy()

In [113]:
# Lookup from a row's id to its author.
id_to_author = dict(zip(full_df["id"], full_df["author"]))

# target: author of the row referenced by parent_id;
# indirect_target: author of the row referenced by link_id.
# Ids that are not present in the frame map to NaN.
full_df["target"] = full_df["parent_id"].map(id_to_author)
full_df["indirect_target"] = full_df["link_id"].map(id_to_author)

In [114]:
# Write the selected columns to a local Parquet file (index omitted).
full_df[['author', 'is_adoptee', 'full_text', "subreddit", "target","indirect_target", "num_comments", "score"]].to_parquet('input.parquet', index=False)

In [115]:
# Preview the first 15 rows. NOTE(review): the rendered output includes author
# names and post text — consider clearing it before sharing the notebook.
full_df.head(15)

Unnamed: 0,parent_id,id,link_id,author,author_flair,num_comments,time,title,text,score,subreddit,full_text,is_adoptee,target,indirect_target
0,5k0hve,5k0hve,5k0hve,Shamaroo,,13.0,2016-12-24 00:55:47,I have no clue if this is the right place but ...,My wife works at a children's hospital not goi...,6,Adoption,i have no clue if this is the right place but ...,-1,Shamaroo,Shamaroo
1,5k0hve,dbkdylx,5k0hve,surf_wax,Adoptee,,2016-12-24 01:17:51,,Probably not going to happen. If she was just...,7,Adoption,probably not going to happen. if she was just ...,1,Shamaroo,Shamaroo
2,dbkdylx,dbkxkh9,5k0hve,usernamebrainfreeze,,,2016-12-24 14:37:07,,Also if your willing to consider taking in a s...,3,Adoption,also if your willing to consider taking in a s...,-1,surf_wax,Shamaroo
3,dbkdylx,dbkragy,5k0hve,thismoment76,,,2016-12-24 08:56:45,,I know a family that went through fostering a ...,2,Adoption,i know a family that went through fostering a ...,-1,surf_wax,Shamaroo
4,dbkdylx,dbkg6si,5k0hve,Shamaroo,,,2016-12-24 02:20:31,,Hey thanks for the reply even tho it's sucks t...,2,Adoption,hey thanks for the reply even tho it is sucks ...,-1,surf_wax,Shamaroo
5,dbkg6si,dbkgybr,5k0hve,surf_wax,Adoptee,,2016-12-24 02:41:44,,"Yeah, I get it -- there are some awful people ...",4,Adoption,yeah i get it there are some awful people out ...,1,Shamaroo,Shamaroo
6,dbkdylx,dbqdrz2,5k0hve,Starrcraters,,,2016-12-28 22:19:35,,You'd be surprised how quickly reunification a...,1,Adoption,you would be surprised how quickly reunificati...,-1,surf_wax,Shamaroo
7,5k0hve,dbl7ipa,5k0hve,ThrowawayTink2,,,2016-12-24 19:14:59,,It actually sounds like this baby was removed ...,3,Adoption,it actually sounds like this baby was removed ...,-1,Shamaroo,Shamaroo
8,5k0hve,dbkpyc8,5k0hve,ralpher1,,,2016-12-24 07:46:16,,Birth mom will need to consent to place the ch...,2,Adoption,bmom will need to consent to place the child f...,-1,Shamaroo,Shamaroo
9,dbkpyc8,dbqd9js,5k0hve,Starrcraters,,,2016-12-28 22:08:02,,Parents do not have to consent to adoption if ...,1,Adoption,parents do not have to consent to adoption if ...,-1,ralpher1,Shamaroo


In [116]:
# Upload the Parquet file under the same object key; upload_to_s3 returns
# True on success and False if the upload raised.
upload_to_s3('input.parquet', BUCKET_NAME, 'input.parquet')

True

## Serial code that accomplishes the same thing

In [119]:
stop_words = set(stopwords.words('english'))
# add more stop words
# Every entry needs its own comma: adjacent string literals are concatenated by
# Python, so a missing comma silently merges several tokens into one bogus
# entry (previously '/u' "/r" "r/" became the single string '/u/rr/').
custom_stopwords = {
    "could've", "would've", "r", "u/", "u", "/u", "/r", "r/", "t", "ve", "s", "m",
    "ll", "nt", "d", "re", "n", "y", "b", "p", "f", "c", "e", "g",
    "h", "j", "k", "l", "o", "q", "v", "w", "x", "z", "a", "i", "gt", "amp",
    "like", "don", "just", "kinda", "want", "know", "think", "dosnt",
    "get", "say", "go", "make", "andor", "yo", "andme", "bc", "nah",
    "ect", "soo", "sooo", "soooo", "sooooo", "soooooo", "sooooooo", "eachother",
    "modmail", "cuz", "andnbsp", "los", "yoffe", "thier", "ou",
    "alot", "atleast", "yall", "notall", "noone", "eithe", "hai", "tion",
}
stop_words.update(custom_stopwords)
stop_words.update(nlp.Defaults.stop_words)
# Flag every collected word as a stop word in the spaCy vocabulary as well.
for stopword in stop_words:
    nlp.vocab[stopword].is_stop = True

In [120]:
def tokenize_and_normalize(word_list, custom_stopwords=None):
    """
    Tokenize text with spaCy and produce a lemmatized, stopword-filtered copy.

    Args:
        word_list (list of str or str): A list of words (joined with spaces)
            or a single string of text. The text is lowercased before parsing.
        custom_stopwords (collection of str, optional): Token texts to leave
            out of the normalized output. ``None`` (the default) filters no
            stopwords.

    Returns:
        tuple: ``(tokenized, normalized)`` where ``tokenized`` is the list of
        token texts without punctuation or whitespace-only tokens, and
        ``normalized`` is the list of lemmas for those tokens that are also
        not stopwords and not number-like.
    """
    # Membership tests against None raise TypeError, so fall back to an empty set.
    if custom_stopwords is None:
        custom_stopwords = frozenset()

    # Join the list into a single string if it's a list of words
    if isinstance(word_list, list):
        word_list = ' '.join(word_list)

    # Process the text with spaCy
    doc = nlp(word_list.lower())

    tokenized = []
    normalized = []
    for token in doc:
        # Punctuation and whitespace-only tokens are dropped from both outputs.
        if token.is_punct or not token.text.strip():
            continue
        tokenized.append(token.text)
        # Normalized output additionally drops stopwords and numbers.
        if token.text not in custom_stopwords and not token.like_num:
            normalized.append(str(token.lemma_))

    return tokenized, normalized


def tokenize_sents(word_list, model=nlp):
    """
    Split text into sentences using a spaCy pipeline.

    Parameters:
        word_list: The text to segment; it is passed to ``model`` unchanged
            (the caller in this notebook passes a single string).
        model (Spacy model): Pipeline used for sentence segmentation.
            Defaults to the module-level ``nlp``.

    Returns:
        list: The text of each detected sentence, stripped of surrounding
        whitespace.
    """
    return [span.text.strip() for span in model(word_list).sents]


# created by ChatGPT to help me fix an error with tokenized_sents
def tokenize_edited(x, stop_words=None):
    """
    Sentence-split ``x`` and tokenize/normalize each sentence.

    Args:
        x (str): Text to process.
        stop_words (collection of str, optional): Stopwords forwarded to
            ``tokenize_and_normalize``; ``None`` means no stopword filtering.

    Returns:
        tuple: A 2-tuple ``(token_sents, norm_sents)``. For non-empty input each
        element is a tuple holding one token list per sentence; for empty or
        whitespace-only input both elements are empty lists.
    """
    # Never forward None: the tokenizer does membership tests on this value.
    if stop_words is None:
        stop_words = set()

    if x.strip():  # Check if x is non-empty and not just whitespace
        tokenized = [tokenize_and_normalize(s, stop_words) for s in tokenize_sents(x)]
        if tokenized:  # Ensure there is something to unzip
            # Materialize the transpose so both branches return a reusable
            # 2-tuple instead of a one-shot zip iterator.
            return tuple(zip(*tokenized))
    # Return empty lists if no content
    return ([], [])


def tag_sents_pos(sentences):
    """
    POS-tag tokenized sentences with spaCy, in the shape NLTK's tagger returns.

    The token lists are joined back into one space-separated string, parsed
    again with ``nlp``, and returned as one list per sentence that spaCy
    detects, each holding ``(token_text, fine_grained_tag)`` tuples. Sentence
    boundaries therefore come from the re-parse, not from the input grouping.
    """
    flattened = ' '.join(' '.join(tokens) for tokens in sentences)
    return [
        [(token.text, token.tag_) for token in sent]
        for sent in nlp(flattened).sents
    ]

In [121]:
tqdm.pandas()
# Each row yields a (token_sents, norm_sents) pair; zip(*...) transposes the
# per-row pairs into two column-length sequences.
full_df["token_sents"], full_df["norm_sents"] = zip(*full_df.full_text.progress_apply(lambda x: tokenize_edited(x, stop_words)))
# Flatten the per-sentence lists into a single token list per row.
full_df["norm_tokens"] = full_df.norm_sents.progress_apply(lambda x: [item for sublist in x for item in sublist])
full_df["full_tokens"] = full_df.token_sents.progress_apply(lambda x: [item for sublist in x for item in sublist])
# POS-tag the tokenized sentences (re-parses each row's text with spaCy).
full_df['POS_sents'] = full_df.token_sents.progress_apply(lambda x: tag_sents_pos(x))
# Token counts before and after normalization.
full_df["num_tokens"] = full_df.full_tokens.apply(len)
full_df["num_norm_tokens"] = full_df.norm_tokens.apply(len)

# takes 253 minutes 7.8 seconds to run

  0%|          | 0/299864 [00:00<?, ?it/s]

100%|██████████| 299864/299864 [3:15:13<00:00, 25.60it/s]   
100%|██████████| 299864/299864 [00:01<00:00, 160369.34it/s]
100%|██████████| 299864/299864 [00:03<00:00, 90230.62it/s]
100%|██████████| 299864/299864 [57:42<00:00, 86.61it/s]  


In [122]:
# Persist the full frame, including the token and POS columns, as a pickle in
# the current working directory.
full_df.to_pickle(r"serial_output.pkl")