# Content Analysis Project

My project is about analyzing online forum discussions related to the topic of adoptiom. To accomplish this task I gathered textual data from 2 popular subreddits with the aims of discussing adoption: r/Adoption and r/Adopted. The full and combine data set is available at this [link](https://uchicago.box.com/s/qhblnxta8j0b2nc6gexywqr81jgr9er2).

In [72]:
import pandas as pd
import numpy as np
import ast
import re
import json
import sklearn.decomposition
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, roc_curve, roc_auc_score, confusion_matrix

# ! pip install tensorflow
from tensorflow.keras.models import Sequential
from tensorflow.keras import layers, models
from tensorflow.keras.metrics import Precision, Recall, AUC
from tensorflow.keras.layers import Dense, Dropout, Embedding, GlobalMaxPooling1D, LSTM
# ! pip install tensorflow-addons
from tensorflow_addons.layers import SparseDense
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences

from tqdm import tqdm
import spacy
import gensim
from gensim.models import KeyedVectors, LdaModel

import nltk
from nltk.probability import FreqDist
from nltk.corpus import stopwords
import wordcloud 
# ! python -m spacy download en_core_web_sm
nlp = spacy.load("en_core_web_sm")

# data viz
import matplotlib.pyplot as plt
from matplotlib.colors import LinearSegmentedColormap as LSC
import seaborn as sns



ImportError: cannot import name 'SparseDense' from 'tensorflow_addons.layers' (c:\Users\Ethan\anaconda3\Lib\site-packages\tensorflow_addons\layers\__init__.py)

In [3]:
adoptee_labels = ['Adoptee (UK)',
 ' Adoptee of Closed Adoption',
 'Domestic Infant Adoptee',
 'Transracial Adoptee',
 'Adoptee',
 'International Adoptee',
 'Baby Scoop Era Adoptee',
 'Adult Adoptee (DIA)',
 'Mentally ill adopted teen',
 'adopted at birth',
 'Failed Adoptee',
 'Reunited Adoptee',
 'Adoptee ❤️',
 'Adopted at 2 from Ukraine to the USA',
 'Teen Adoptee, open adoption',
 'Adoptee and Birth Parent',
 'in adoption limbo ...',
 'adopted family divorcee, adopted by birth mom',
 'adoptee in reunion',
 'adoptee',
 'Korean-American  Adoptee',
 'Adoptee with 6 parents',
 'Happily reunited adoptee',
 '32/M/adoptee/in-reunion',
 'Adopted @ 6yo',
 '(Adoptee, 1973)',
 'Adoptee, looking',
 'Adoptee/Step Dad',
 'Adoptee and Birthmother',
 'Adoptee recently reunited',
 'Adopted',
 'Adoptee Moderator',
 'Adopted, trad/closed, Ohio',
 'Adoptee, Foster Mama',
 'Adult Adoptee',
 'Closed At-Birth Adoptee',
 'Foster Adoptee',
 'Reunited adoptee',
 'adoptee, closed adoption, seeking reunification',
 'Chinese Adoptee',
 'late-discovery-adoptee',
 'Korean Adoptee, Married, CF',
 'Adopted at birth',
 'Russian - Kiwi Adoptee',
 'Not Quite Adopted',
 'adopted',
 'INFP: The Dreamer',
 'Transracial adoptee',
 'Closed Adoption Adoptee',
 'Adoptee Recently Reuinited',
 'transracial &amp; transnational adoptee',
 'Transracial US Domestic Adult Adoptee',
 'Korean Adoptee',
 'LDA, FFY, Indigenous adoptee',
 'Adoptee',
 'Chinese Adoptee in Canada (23F)',
 '1970 Closed Adoption Adoptee',
 'adopted from China at 12mo',
 'Open Adoption Adoptee',
 'LDA, FFY, Indigenous adoptee',
 'Russian Adoptee',
 'International adoptive mom of two (Vietnam)',
 'Russian adoptee',
 'neo city 💚',
 'Pre-Adoptive / Prospective Parents (PAP)',
 'Teen Adoptee',
 'Live, Love, Learn',
 'transracial adoptee',
 'adoptee // 23',
 'TRA/ICA/KAD (minor)',
 'Adult Adoptee🤍',
 'Adult adoptee, hoping to adopt',
 'Punjabi-Canto transracial adoptee',
 'Asian Adoptee',
 'adoptee',
 'Adoptee From USA',
 'recently found my bio fam :)',
 'non paternal event / LDA',
 'Adopted',
 'Late Disclosure Adoptee, Future Adoptive Parent',
 'Private Infant Adoptee - 24F',
 'adopted at infancy',
 '(Lifelong Open) Adoptee',
 'Adoptee (🇨🇳 —&gt; 🇫🇷)',
 'Late Discovery Adoptee (LDA)',
 'International Asian TRA',
 "int'l adoptee",
 'TRA',
 'Teen Adoptee, open adoption',
 'Adoptee &amp; Genealogical Detective!',
 'adult adoptee',
 'multiracial // transracial adoptee | prioritizing adoptee voices',
 'step adoptee',
 'half-adopted, hap',
 'Adoptee @ 106 Days &amp; Genealogical Detective!',
 'Adoptee; Confused as Hell',
 'Second-generation adoptee',
 'TRA/ICA',
 'Adoptee (domestic infant adoption)',
 'Closed Adoption Infant Adoptee',
 'UK Adoptee',
 'Closed domestic (US) infant adoptee in reunion',
 'adopted & hap',
 'Adopted in the late 60’s',
 'Transracial Adult Adoptee',
 'Adoptee @ 106 Days & Genealogical Detective',
 'Adoptee & AP',
 'adoptee & parent',
 'Domestic Infant Adoptee',
 '🇷🇺',
 'Closed domestic (US) adult adoptee in reunion',
 'Black adult invisible adoptee',
 'Chinese American Adoptee',
 'Adoptee (US)',
 '60s scoop reunited',
 'transracial adoptee',
 'DIA in Reunion',
 'Adoptee of Closed Adoption',
 'Private Infant Adoptee - 25F',
 'transracial closed adoptee',
 "adoptee '87",
 'TRA / Chinese adoptee',
 'Late discovery adoptee, 26 yrs. Met bio families.',
 'Adult DIA Adoptee',
 'late age adoptee',
 'adoptee & birthparent',
 'victim of domestic & state violence via transracial adoption',
 'Adoptee, Birthmother, & Parent',
 'Domestic Adoptee 1988',
 'BIA adoptee',
 'foster care (2007-2010) / adopted (2010)',
 'adoptee open adoption',
 'Adoptee and Psychologist',
 'Who am I?',
 'domestic infant(ish) adoptee',
 'Adoptee of Failed Adoption',
 'Adoptee and  Birthmother',
 'adoptee + adoptive parent',
 'Reunited Adoptee &amp; Adoptee Rights Activist',
 'Adoptee, Adoptive Parent',
 'adoptee 3.11.87',
 'Adopted Faery',
 'Child of two families'
 'Adopted Kid',
 'Adopted from Bangladesh',
 'KAD wutup!',
 'Adopted: birth.  Found bio siblings: age 20.',
 'Adoptee, 29F',
 'Closed adoption: birth. Found bio siblings: age 20.',
 'Adoptee /  Adoptive Parent',
 'Adoptee Found Birth Family',
 'adoptee; foster parent',
 'Adopted/Plans to adopt',
 'Transracial Adoptee (KAD)',
 'Birthmom+Adoptee',
 'Adopted at Birth | Found Birthfamily',
 'Foster Parent/Adoptee',
 'Moderator, adoptee',
 'Two moms, two dads, lucky reunited adoptee',
 'adoptee / plans to adopt',
 'Adopted 1993 | Reunited 2017',
 'Adopted as a baby',
 'Chinese adoptee',
 'Adoptee (International)',
 'Adoptee, Birthmother, Adoptive parent',
 'Adoptee, Activist',
 'Adopted from Russia',
 'Adoptee - Found birth family',
 'Punjabi-Canto interracial adoptee',
 'Korean adoptee',
 'Transracial Adoptee &amp; Birth Mother',
 'Trans-Racial Adoptee | PAP | Anti-Natalist',
 'Kazakh adoptee',
 'late-discovery adoptee, ex-ward',
 'LDA, ex-ward, Indigenous post-ICWA adoptee',
 'Adult Adoptee/Found Bio Parents - Ohio 1986 Prive Adoption',
 'Transnational Adoptee from Birth',
 "author, the adoptee's guide to dna testing (book)",
 'LDA, ex-ward, Indigenous adoptee',
 'Adoptee, Only Child',
 'Closed Adoption Adoptee Reunited',
 'late-discovery-adoptee, ex-foster-kid',
 'Adoptee &amp; Adopter',
 'Taiwanese Adoptee',
 'r/Adoptee Moderator',
 'Closed DIA',
 'Adopted aged four',
 'Adult adoptee',
 'Adopted @11days - reunited @ 27y/o',
 'TRA/IA/LDA/AP/FP',
 'intrafamily adoptee, school aged adoptee',
 'Adoptee/closed Birthmom/open',
 'china adoptee',
 'Childhood adoptee/Birthmother to now adult',
 'First Nations Adoptee',
 'Chinese Transracial Adoptee',
 'second-generation adoptee',
 'International adoptee',
 'International Transracial Adoptee',
 'From Russia with Love?',
 'FFY/Adoptee',
 'TransAdoptedKid',
 'adoptee and 23 ✌️',
 'Adopted Person',
 'adoptee/former foster kid',
 'Pre-Adoptive Parent | Adopted',
 'adoptee // 24',
 'victim of domestic &amp; state violence via transracial adoption',
 'cambodian adoptee',
 'Adult Adoptee Found BioFamily',
 'Birth adoptee reunited w/BM &amp; Half-Siblings',
 'Adoptee, may consider adoption in the future',
 'Adopted from China',
 'Adopted at birth',
 'Transracial Indigenous Adoptee',
 'Adoptee from birth',
 'Adopted Chinese',
 'reunited adoptee',
 'TRA/ICA/KAD',
 'Adopted as an Infant',
 'Transracial/international Adoptee',
 'open adoptee from birth',
 'Half-adopted',
 'adoptee &amp; parent',
 'Adopted at Birth',
 'domestic adoptee at birth | found birthparents']

# for regex will need to add \\ to escape the escape characters
adoptee_labels = [re.escape(i) for i in adoptee_labels]
# | to signify "or" for future regex
adoptee_pattern = re.compile(r"{}".format("|".join(adoptee_labels)), 
                             re.IGNORECASE)

# Now do the same for non adoptee labels
non_adoptee_labels = ['Former Foster Youth',
 'Future AP',
 'Bio Parent',
 'Birthfather',
 'Prospective Adoptive Parent',
 'Birth Mother - Open Adoption',
 'Mom through private domestic open transracial adoption',
 'Reunited mother, former legal guardian, NPE',
 'Reunited Mom, Foster Mom, L8 Dscvry Adoptee-paternal side',
 'Reunited Birthparent.',
 'Current Intl AP, Past Temp Foster Child',
 'Birth Mom',
 'AP, former FP, ASis',
 'Birthmother.',
 'birthmother',
 'Birthmother 6/23/12',
 '14 adoptions in my family',
 'Hopeful AP',
 'Adoptive parent',
 'birth parent',
 'Adoptive Parent',
 'Birthmother, 2002',
 'Biological Father - searching',
 'looking to adopt',
 'adoptive father',
 'Father of sibling group of 3',
 'birth mom',
 'Hopeful adopter',
 'Adopting in Arkansas',
 'Birthmom',
 'Hoping to adopt',
 'Looking into Adopting',
 'caseyalexanderblog.wordpress.com',
 'naturalmother_8-14-01',
 'Someday-adopter',
 'Potential Birthmother',
 'waiting prospective AP',
 'future AP',
 'foster-to-adopt aunt/mom',
 'Adoptive Dad',
 'BP',
 'I Fostered &amp; Then Adopted',
 'Adoptive Parent/Orphanage Supervisor',
 'Adoptive Parent (fostadopt)',
 'hypervigilant.org',
 'Possible AP',
 'fost-adopt parent',
 'pre-adoptive parent',
 'Birth Parent',
 'Birth Father &amp; /r/OpenAdoption Owner',
 'RecentBM',
 'Birthmother',
 'Adoptive/Foster Mom',
 'Adoptive Father',
 'foster adopt',
 'Adopted Family Member?',
 'Prospective adoptive parent',
 'Pre-fostering | prospective foster',
 'First Mother',
 'Bio of 2, Adoptive of 2',
 'Sister adopted in x2, aunt adopted out x1',
 'Hopeful APs',
 'Researching foster/adoptive parenting',
 'future FAD parent',
 'Birthmother, Open Adoption',
 'Adopting!',
 'Adoptive Parent - Intercountry + Fostered',
 'Post-Adoptions social worker', 
 'Adoptive Parent, Data Analyst',
 'Bio sis',
 'Adoptive/Foster Parent',
 'ex-foster-kid',
 'kinship adoptive parent / foster parent',
 'Birthmother 2/13/2002',
 'Transracial Adoptive Parent/Foster Parent',
 'parent of adopted kids',
 'Adoptive Mama',
 'AP',
 'Birthfather 10/21/1986',
 'Interested, but no plans',
 'Planning to Adopt in the Future',
 'Future Foster/Adoptive Parent',
 'Birth mother',
 'Foster parent/Adoptive parent',
 'Firstmom',
 'Son, 12.. BirthMom',
 'Father of 3, all adopted',
 'kinship/foster parent',
 'Momma',
 '23F- Future Adopter',
 'Reunited Mom',
 'Foster parent',
 'Adoptive father',
 'firstmother 2001',
 'Adoptive mom of 3',
 'Adoptive Dad of 3, soon 6',
 'bio sibling',
 '5 failed matches, currently in #6 due winter 2016',
 'Soon to be mom',
 'Potential Adoptive Parent',
 'foster/pre-adoptive parent',
 'Intl Adoptive Parent',
 'In Progress',
 'Wanting to Adopt',
 'Birth Father',
 '2 failed matches, still hoping',
 'adoptive mom',
 '3 failed matches, still hoping',
 'Someday-adopter, adoptive sister',
 'Future adoptive parent',
 'Matched with an expectant mom due in winter 2015',
 'Adoptive Parents',
 'Birthmother (Open Adoption)',
 'AParent to teen',
 'Son, 8.. BirthMom',
 'Future Parent',
 'considering adopting',
 'Looking to adopt (Ontario)',
 'Homestudied and waiting',
 'Adopting thru Foster Care',
 'sister of adoptee; future adoptive parent',
 'Adoptive sister',
 'Adoptive Dad of 3, soon 5',
 'Adoption Researcher',
 'was a foster parent',
 'Parent',
 'hoping to adopt',
 'Father of 4 adopted sons',
 'birthmom 2010, beautiful boy!',
 'would like to adopt',
 'Foster-to-Adopt',
 'Birthmom 7/31/1992',
 'potential adoptive father',
 '-25-groomer-wannabe adopter',
 'Luckiest',
 'Homestudied hopeful adopter',
 '3 adopted',
 'sister of an adoptee',
 'parent of several adopted kids',
 'Birthmother, Daughter, Sister, Aunt, and Wife',
 'adoptDad',
 '(b-mom, 1976)',
 'Birthmom 3/15/98',
 'Adoption Specialist',
 'Fost-Adoptive parent of 3',
 'HAP',
 'Adoptive mom - open kinship',
 'sister of adoptee; hopeful future AP',
 'Birth Mom',
 'haole, male, father to a daughter who was adopted, but not by me',
 'may adopt in the future',
 'LGBT adoptive parent &amp; daughter of adoptee',
 'Adoptive Parent of Older Teen',
 'prospective/pre-adoptive parent',
 'Birthmom 2017',
 'Birthmom 12/18/18',
 'Furture adoptive mom, by choice.',
 'Adoptive Parent x3',
 'Future Father',
 'Adoptive Parent &amp; Spouse to Adoptee',
 'Adoptive Mama',
 'prospective adoptive parent',
 'Bio-Sis, Hopefully Future Adoptive Parent',
 'Kinship AP',
 'NY, Adoptive Parent, Permanency Specialist',
 'Prospective AP',
 'FFY - AP',
 'Perspective adoptive parent',
 'Adoptive Parent (International/Transracial)',
 'Birth Mum.',
 'Former foster kid. Almost-adopted more than once.',
 'Adoptive Mother | Australia',
 'Hoping to Adopt',
 'Parent by Adoption',
 'Researching PAP',
 'One Adopted (Kinship), Seven Bio',
 'Adoptive Parent (Kinship Via Husband)',
 'foster mom',
 'Younger Bio Sibling',
 'Foster/Adoptive parent',
 'AdoptiveParent',
 'Adult Child of Adoptee',
 'AP of teen',
 'adoptive parent',
 'Birth mom, 2017',
 'Birth Mum of two - adopted by force.',
 'buried under a pile of children',
 'Adoptive mom',
 'Daughter of 2 adoptees',
 'Adoptive Mother',
 'Reunited Birthparent.',
 'Foster Parent',
 'Adoptive Dad of 5 (2 sib grps from foster care)',
 'Birthmother 12/13/2002',
 'PAP',
 'Prospective Parent',
 'AP, former FP, ASis',
 'biological parent',
 'Birth Parent in StepParent Adoption',
 'Open Adoption Birth Father &amp; /r/OpenAdoption Owner',
 'Foster Parent, Child Welfare Public Health Professional',
 'Have adopted-in siblings; searching for adopted-out sister',
 'Pre-Placement Parent',
 '16|05.20.2020|Adoption',
 'Prep-Adoptive',
 'daughter of an adoptee',
 'Adoptive Mom',
 'reunited mom, lgl grdian, NPE',
 'Birth Mother',
 'NPE',
 'Bio-Sib of an adoptee',
 'Potential Foster Parent',
 'Foster / Adoptive Parent',
 'Foster Mom',
 'mother was adopted',
 'Stepmum to long lost adoptee / reunited',
 'daughter of an adoptee. future adoptive parent.',
 'Birthparent / Baby Girl due 12/28 :)',
 'Prospective Adoptive Mother',
 'Reunited Birthmom',
 'Foster/Adoptive Parent',
 'Hopeful Adoptive Parent',
 'foster parent',
 'Prospective Adoptive Parent',
 'hopeful foster parent',
 '15 adoptions in my family',
 'Reunited Birthparent.',
 'Reunited Bio Mom',
 'Adoptive Parent &amp; Adoptee’s spouse']

non_adoptee_labels = [re.escape(i) for i in non_adoptee_labels]
non_adoptee_pattern = re.compile(r"{}".format("|".join(non_adoptee_labels)), 
                                 re.IGNORECASE)

#### Helpers

In [4]:
# for opening reddit json
def open_reddit_json(file_path):
    """
    Takes a string of a json (of scrapped Reddit data) file and turns it into a dataframe
    Inputs:
        file_path (str): the file path of the JSON file
    Returns pandas DataFrame of pertinent information
    """
    file = open(file_path, "r")

    users = []
    user_flairs = []
    titles = []
    post_texts = []
    post_dates = []
    post_flairs = []
    scores = []
    n_comments_list = []
    links = []

    for line in file:
        post = json.loads(line)

        try:
            users.append(post.get("author", np.nan))
            user_flairs.append(post.get("author_flair_text", np.nan))
            titles.append(post.get("title", np.nan)) # will be nan for comments

            # post text data is located in different places if comment vs post
            post_text = post.get("selftext")
            if post_text:
                post_texts.append(post_text)
            else:
                post_texts.append(post.get("body", np.nan))

            post_dates.append(post.get("created_utc", np.nan))
            post_flairs.append(post.get("link_flair_text", np.nan))
            scores.append(post.get("score", np.nan))
            n_comments_list.append(post.get("num_comments", np.nan))
            if post.get("permalink"):
                links.append("https://old.reddit.com" + post.get("permalink", np.nan))
            else:
                links.append(np.nan)       
        except:
            continue            

    output = pd.DataFrame({"user": users,
                           "user_flair": user_flairs,
                           "title": titles,
                           "post_text": post_texts,
                           "post_date": post_dates,
                           "post_flair": post_flairs,
                           "score": scores,
                           "n_comments": n_comments_list,
                           "link": links})
    return output


def categorize_user(x):
    """
    Given a user flair (str) categorize as either adoptee, non-adoptee, 
        or NEI (not enough info)
    Input: X (str): a user flair
    Returns: 1 if adoptee, 0 if non-adoptee, 2 if NEI    
    """
    if adoptee_pattern.match(x):
        return 1  # Adoptee
    elif non_adoptee_pattern.match(x):
        return 0  # Non-adoptee
    else:
        return 2  # NEI

### HELPERS INSPIRED BY HW2 AND HW4 OF CONTENT ANALYSIS ###

# NOT ACTUALLY USED
# def word_tokenize(word_list):
#     """
#     Tokenize a list of words or a single string
#     Input: list of strings or a single string
#     Returns a list of tokenized strings
#     """
#     # Pass word list through language model
#     if isinstance(word_list, str):
#         word_list = [word_list]
#     doc = nlp(" ".join(word_list))    
#     return [token.text for token in doc if not token.is_punct and token.text.strip()]


# NOT ACTUALLY USED
# def normalize_tokens(word_list):
#     """
#     Tokenize and normalize a list of words
#     Inputs:
#         word_list: list of strings for words to be tokenized
#     Returns a list of normalized strings
#     """
#     # Convert word_list to a single string
#     if isinstance(word_list, list):
#         word_list = ' '.join(word_list)
    
#     # Tokenize the text and convert to lowercase
#     doc = nlp(word_list.lower())

#     # Extract normalized tokens
#     normalized = [str(w.lemma_) for w in doc 
#                   if not w.is_stop and not w.is_punct 
#                   and not w.like_num and len(w.text.strip()) > 0]

#     return normalized


# USED, logic stems from above two functions
def tokenize_and_normalize(word_list):
    """
    combine the logic for the two functions above
    """
    if isinstance(word_list, list):
        word_list = ' '.join(word_list)
    doc = nlp(word_list.lower())
    tokenzied = [token.text for token in doc 
                 if not token.is_punct and token.text.strip()]
    normalized = [str(w.lemma_) for w in doc 
                  if not w.is_stop and not w.is_punct 
                  and not w.like_num and len(w.text.strip()) > 0]

    return tokenzied, normalized


def tokenize_sents(word_list, model=nlp):
    """
    Tokenize a list of words using a specified model.

    Parameters:
        word_list (list): A list of words to be tokenized into sentences.
        model (Spacy model): the language model to be used for tokenization. 
            Defaults to nlp model.

    Returns:
        list: list of sentences extracted from the input text.
    """
    doc = model(word_list)
    sentences = [sent.text.strip() for sent in doc.sents]
    return sentences


def tag_sents_pos(sentences):
    """
    function which replicates NLTK pos tagging on sentences.
    """
    new_sents = []
    for sentence in sentences:
        new_sent = ' '.join(sentence)
        new_sents.append(new_sent)
    final_string = ' '.join(new_sents)
    doc = nlp(final_string)

    pos_sents = []
    for sent in doc.sents:
        pos_sent = []
        for token in sent:
            pos_sent.append((token.text, token.tag_))
        pos_sents.append(pos_sent)

    return pos_sents

### Exploratory Data Analysis

My data comes from 3 sources in total and were combined in the above into one large data set. The first source was a reddit archival site which contained all posts from the subreddit r/Adoption until December of 2022. Furthermore I created a webscrawler and scrapper to gather more reddit posts from r/Adoption for more current posts and also to get data for r/Adopted (because the subreddit was smaller, it was not archived by the [site](https://the-eye.eu/redarcs/) I used). In total there are around 330k posts and comments in total. I keep track of a few variables: user, user flair (if any), post_title (if post, otherwise NaN), post_text, post_date, post_flair (if any), score (total of upvotes and downvotes), n_comments (if applicable), link, and subreddit. I have also created a few more columns for ease of analysis: is_comment, full_text (title + post text if applicable), cleaner_text, full_tokens, word_count, norm_tokens, token_sents, norm_sents, POS_sents, and is_adoptee. Please note that the all_df is actually created during the next section data cleaning and data wrangling but is presented here for ease of understanding the entire data set.

In [None]:
# all_df = pd.read_pickle("NEW_all_df.pkl")

In [None]:
all_df.columns

In [None]:
all_df.shape

Let's take a look at all of the variables within this data set. 

In [None]:
all_df.dtypes

We notice that most of the columns are objects (usually strings or list of strings of some sort). Only a few are numeric (word_count, n_comments, and is_adoptee). We can see in the data cleaning section below that the is_adoptee column has values of 0, 1, and 2 (which was coded by hand).

### Data Cleaning, Merging, Wrangling

In [None]:
# for r/Adopted
adopted_df = pd.read_csv("D:\\hw\\adopt-proj\\adopt_posts.csv")
# change to date time
adopted_df['post_date'] = pd.to_datetime(adopted_df['post_date'])
adopted_df['subreddit'] = "r/Adopted"

# for r/Adoption
adoption_1 = pd.read_csv("D:\\hw\\adopt-proj\\adoption_posts.csv")

adoption_2_posts = open_reddit_json("D:\\hw\\adopt-proj\\Adoption_submissions.json")
adoption_2_comms = open_reddit_json("D:\\hw\\adopt-proj\\Adoption_comments.json")
adoption_2_posts["is_comment"] = False
adoption_2_comms["is_comment"] = True

# fix post date data type
adoption_1['post_date'] = pd.to_datetime(adoption_1['post_date'])
adoption_2_comms['post_date'] = pd.to_datetime(adoption_2_comms['post_date'].astype(int), unit='s').dt.tz_localize("UTC")
adoption_2_posts['post_date'] = pd.to_datetime(adoption_2_posts['post_date'].astype(int), unit='s').dt.tz_localize("UTC")

# combine all sources of r/Adoption data
adoption_df = pd.concat([adoption_1, adoption_2_posts, adoption_2_comms])
adoption_df['subreddit'] = "r/Adoption"

In [None]:
# combine both subreddits
all_df = pd.concat([adopted_df, adoption_df])
print("The unedited dataframe is size:", all_df.shape)
# drop na
all_df.dropna(subset=['post_text'], inplace=True)
# reset indices 
all_df.reset_index(drop=True, inplace=True)
all_df["full_text"] = (all_df.title.fillna("") +  ". " + 
                       all_df.post_text).replace(r"^. ", "", regex=True)

The unedited dataframe is size: (332300, 11)


In [None]:
# remove rows with nans or empty strings
all_df = all_df[all_df.dropna(subset=['full_text']).full_text != ""]
# for some reason there is this one weird character for score in one row
all_df.replace("•", np.nan, inplace=True)
all_df["score"] = all_df.score.astype(float)

In [None]:
useless_posts = r"^(?:\(|\[)?(?:deleted|removed)(?:\)|\])?[\s-]*"
# filter out the useless posts
all_df = all_df[~all_df['post_text'].str.contains(useless_posts, case=False, regex=True)]

In [None]:
# remove duplicates
all_df.drop_duplicates(subset=["user","title", "subreddit", "post_text", "full_text"], inplace=True)

In [None]:
# because the info is stored in full_text, we don't need these columns anymore
all_df.drop(columns=["title", "post_text"], inplace=True)
all_df.reset_index(drop=True, inplace=True)

In [None]:
# let's add labels and non labels
all_df["is_adoptee"] = all_df["user_flair"].astype(str).apply(categorize_user)

Now that we have filtered out rows will not be using in the analysis, it is time to move onto further data cleaning and wrangling

In [None]:
nonenglish_syms = r"""[^a-zA-Z\d \.\?\,\!\-\#\:\:\\\\\/]"""
straight_curly = r"[\’\‘\“\”]"
# filter created with help from ChatGPT
url_pattern = r"http[s]?://(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\\(\\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"

In [None]:
# stop word stuff
# set a stopwords set
stop_words = set(stopwords.words('english'))
# add more stop words
custom_stopwords = set(["could've", "would've", "r", "u/", "u", "/r" "r/", "t", 've', 's', 'm', 
                        'll', 'd', 're', 'n', 'y', 'b', 'p', 'f', 'c', 'e', 'g', 
                        'h', 'j', 'k', 'l', 'o', 'q', 'v', 'w', 'x', 'z', 'a', 'i', "gt", "amp"])

stop_words.update(custom_stopwords)
remove_stops = r'\b(?:{})\b'.format('|'.join(stop_words))
# update nlp stopwords vocab
for stopword in stop_words:
    nlp.vocab[stopword].is_stop = True


Clean the the text

In [None]:
all_df["cleaner_text"] = all_df.full_text.str.lower().replace(url_pattern, " ",
                                        regex=True).replace(nonenglish_syms, " ", 
                                        regex=True)

Process the text (tokenize, normalize, POS tagging) for words and sentences

In [None]:
# for progress_apply bars
tqdm.pandas()

In [None]:
all_df["tokens"], all_df["norm_tokens"] = zip(*all_df.cleaner_text.progress_apply(lambda x: tokenize_and_normalize(x)))

100%|██████████| 302429/302429 [1:08:24<00:00, 73.69it/s] 


In [None]:
all_df["toke_sents"], all_df["norm_sents"] = zip(*all_df.cleaner_text.progress_apply(lambda x: zip(*[tokenize_and_normalize(s) for s in tokenize_sents(x)])))

100%|██████████| 302429/302429 [3:44:27<00:00, 22.46it/s]    


In [None]:
all_df['POS_sents'] = all_df.toke_sents.progress_apply(lambda x: tag_sents_pos(x))

100%|██████████| 302429/302429 [1:05:45<00:00, 76.64it/s] 


In [12]:
# add counts
all_df["num_tokens"] = all_df.tokens.apply(len)
all_df["num_norm_tokens"] = all_df.norm_tokens.apply(len)

In [13]:
all_df.to_pickle("UPDATED_all_df.pkl")

### word2vec models

#### create models

In [None]:
# create models for adoptee and non adoptee
adoptee_model = gensim.models.word2vec.Word2Vec(all_df[
    all_df.is_adoptee == 1].norm_sents.explode().dropna().reset_index(drop=True))
non_adoptee_model = gensim.models.word2vec.Word2Vec(all_df[
    all_df.is_adoptee == 0].norm_sents.explode().dropna().reset_index(drop=True))
adoptee_model.save("adoptee_word2vec.model")
non_adoptee_model.save("non_adoptee_word2vec.model")

In [None]:
# create full model for all data and data for each subreddit
all_adopt_model = gensim.models.word2vec.Word2Vec(all_df[all_df.subreddit == 
        "r/Adoption"].norm_sents.explode().dropna().reset_index(drop=True))
r_adopted_model = gensim.models.word2vec.Word2Vec(all_df[all_df.subreddit == 
        "r/Adopted"].norm_sents.explode().dropna().reset_index(drop=True))
r_adoption_model = gensim.models.word2vec.Word2Vec(
    all_df.norm_sents.explode().dropna().reset_index(drop=True))

all_adopt_model.save("NEW_all_adopt_word2vec.model")
r_adopted_model.save("NEW_r_adopted_word2vec.model")
r_adoption_model.save("NEW_r_adoption_word2vec.model")

In [None]:
# train models for the other data
sub_X_train_sum = sub_X_train.norm_sents.explode().dropna().reset_index(drop=True)
train_sub_model = gensim.models.word2vec.Word2Vec(sub_X_train_sum)
train_sub_model.save("train_sub_word2vec.model")

In [None]:
sub_X_test_sum = sub_X_test.norm_sents.explode().dropna().reset_index(drop=True)
test_sub_model = gensim.models.word2vec.Word2Vec(sub_X_test_sum)
test_sub_model.save("test_sub_word2vec.model")

In [None]:
adp_X_train_sum = adp_X_train.norm_sents.explode().dropna().reset_index(drop=True)
train_adp_model = gensim.models.word2vec.Word2Vec(adp_X_train_sum)
train_adp_model.save("train_adp_word2vec.model")

In [None]:
adp_X_train_sum = adp_X_test.norm_sents.explode().dropna().reset_index(drop=True)
test_adp_model = gensim.models.word2vec.Word2Vec(adp_X_train_sum)
test_adp_model.save("test_adp_word2vec.model")

### load data and models

In [5]:
all_df = pd.read_pickle("UPDATED_all_df.pkl")

In [6]:
all_adopt_model = gensim.models.word2vec.Word2Vec.load("NEW_all_adopt_word2vec.model")

In [7]:
# train test split
sub_X_train, sub_X_test, sub_y_train, sub_y_test = train_test_split(all_df.drop("subreddit",
                                                                                axis=1), 
                                                                    all_df.subreddit, 
                                                                    test_size=0.2, 
                                                                    random_state=42)

adp_X_train, adp_X_test, adp_y_train, adp_y_test = train_test_split(all_df.drop("is_adoptee",
                                                                                axis=1), 
                                                                    all_df.is_adoptee, 
                                                                    test_size=0.2, 
                                                                    random_state=42)