## Import libraries

In [1]:
import os
import pandas as pd
# NOTE(review): later cells use names that are never imported explicitly in this notebook
# (spacy, DocBin, glob, Counter, chunker, docbin_counter, ...). Presumably they all arrive
# through the star import below - confirm against drs_corpora.
from drs_corpora import * #import helper functions
# Shared spaCy pipeline object; it is passed to the helper functions in the cells below.
nlp = spacy.load("en_core_web_sm") # load the spaCy model

## Set parameters

In [11]:
# Project-level parameters used by every cell below.
PROJECT = 'anxiety'
# Placeholder: the processing cell overwrites DOCBIN_SIZE with N_PROCESS * BATCH_SIZE.
# Re-running THIS cell afterwards resets it to None again, so if you skip the processing
# step (reusing docbins already on disk), replace None with the docbin size it printed.
DOCBIN_SIZE = None
SAVEDIR = f"../../intermediate_data/{PROJECT}_docbins"
DOCBIN_FILENAME_PATTERN = SAVEDIR + "/" + PROJECT

# Create the docbins folder for this project inside intermediate_data if it does not exist.
# os.makedirs also creates missing parent folders (os.mkdir would raise FileNotFoundError
# when intermediate_data itself is absent).
if os.path.isdir(SAVEDIR):
    print("Path to docbins folder:", SAVEDIR)
else:
    os.makedirs(SAVEDIR, exist_ok=True)
    print("Directory created. Path to docbins folder:", SAVEDIR)

Path to docbins folder: ../../intermediate_data/anxiety_docbins


## Read Raw Data

In [3]:
# Path to the folder containing the CSV files and account_groups.json
input_path = '../../input_data/'

# DataFrame with the accounts and the groups they belong to (professional, experience)
account_groups_df = (
    pd.read_json(f"{input_path}account_groups.json", typ='series')
    .reset_index()
    .rename(columns={"index": "account", 0: "category"})
)

# Read every CSV file in the folder. os.listdir returns names in arbitrary order, so the
# listing is sorted to keep the document order (and therefore the content of each docbin)
# the same from one run to the next.
csv_paths = [
    os.path.join(input_path, filename)
    for filename in sorted(os.listdir(input_path))
    if filename.endswith(".csv")
]
dfs = [pd.read_csv(csv_path) for csv_path in csv_paths]

# Concatenate all DataFrames into a single DataFrame
df = pd.concat(dfs, ignore_index=True)

# Keep only posts that have a body. The index is reset so that each row label equals the
# position of that post in `documents` (filtering alone would leave gaps in the index).
df = df[df["body"].notna()].reset_index(drop=True)
documents = list(df["body"])

# Save to Excel
#df.to_excel(f"{input_path}anxiety_test.xlsx")

## Tokenization, Lemmatization, Named Entity Recognition, and POS Tagging

In [4]:
%%time
nlp.add_pipe("merge_entities") # this makes the tokens cleaner later

#we split up all the documents into a bunch of ".db" files, or "docbin" files that hold our text data.

N_PROCESS = 10 # you can change this depending on how many cores your CPU has (eg. a Mac M1 has 8 cores, so you can use up to 7 here)
BATCH_SIZE = 100 # this depends on how much RAM you have. If this process hangs or crashes, you may want to reduce batch size (how many docs each core will process in one chunk)
DOCBIN_SIZE = N_PROCESS * BATCH_SIZE

print("docbin size for this project: ", str(DOCBIN_SIZE), "\nPlease take note of the docbin size if you want to come back to your project and not process the data again.")

for i, chunk in enumerate(chunker(documents, DOCBIN_SIZE)): # chopping our dataset into chunks. We don't need this in our toy example, but we do for large datsets; change from 100 if you want
    doc_bin = DocBin(store_user_data = True) # create a docbin for our chunk
    for doc in nlp.pipe(chunk, n_process = N_PROCESS, batch_size = BATCH_SIZE): # process our documents, you can play with n_process and batch_size depending on your CPU and RAM
        doc_bin.add(doc) # save the document to our docbin
    chunk_name = DOCBIN_FILENAME_PATTERN + "_docbin_" + str(i) + ".db" # make a nice filename for each chunk
    print("Saving chunk as: ", chunk_name) # display progress
    doc_bin.to_disk(chunk_name) # save docbin for chunk to disk

docbin size for this project:  1000 
Please take note of the docbin size if you want to come back to your project and not process the data again.
Saving chunk as:  ../../intermediate_data/anxiety_docbins/anxiety_docbin_0.db
Saving chunk as:  ../../intermediate_data/anxiety_docbins/anxiety_docbin_1.db
Saving chunk as:  ../../intermediate_data/anxiety_docbins/anxiety_docbin_2.db
Saving chunk as:  ../../intermediate_data/anxiety_docbins/anxiety_docbin_3.db
Saving chunk as:  ../../intermediate_data/anxiety_docbins/anxiety_docbin_4.db
Saving chunk as:  ../../intermediate_data/anxiety_docbins/anxiety_docbin_5.db
Saving chunk as:  ../../intermediate_data/anxiety_docbins/anxiety_docbin_6.db
Saving chunk as:  ../../intermediate_data/anxiety_docbins/anxiety_docbin_7.db
Saving chunk as:  ../../intermediate_data/anxiety_docbins/anxiety_docbin_8.db
Saving chunk as:  ../../intermediate_data/anxiety_docbins/anxiety_docbin_9.db
Saving chunk as:  ../../intermediate_data/anxiety_docbins/anxiety_docbin_1

## Frequency

In [12]:
# Build the ordered list of docbin paths. The ".db" files found in the docbins folder are
# only counted; the paths themselves are regenerated from the naming pattern so that they
# come out in numeric order (0, 1, 2, ...).

docbin_folder = SAVEDIR + "/*.db" # you might have to change this if you use this notebook for a different folder
docbins = list(glob.iglob(docbin_folder))
# The naming pattern of the docbins is hard coded here, so you may have to change this if
# you apply it to another project.
DOCBINS = [
    DOCBIN_FILENAME_PATTERN + "_docbin_" + str(i) + ".db"
    for i in range(len(docbins))
]

In [15]:
# Count words docbin by docbin and fold every per-docbin count into one overall Counter.

subtotals = [] # per-docbin counts, kept in docbin order
total = Counter() # consolidated counts across all docbins
for docbin_path in DOCBINS:
    docbin_counts = docbin_counter(docbin_path, nlp) # counting function applied to one docbin
    subtotals.append(docbin_counts)
    total.update(docbin_counts)

# Save the frequency table to Excel; you can make the filename whatever you want.
fdist = fdist2table(total, savename = "../../output_data/words_frequency.xlsx")

Frequency distribution saved!


In [16]:
# Show the 20 most frequent NOUN entries in a table (change the label to 'VERB' or 'ADJ' for the other word classes)
fdist.loc[fdist["label"] == "NOUN"].sort_values(by = "count", ascending = False).head(20)

Unnamed: 0,word,label,count
48,anxiety,NOUN,31711
575,life,NOUN,23492
11786,judgement,NOUN,16500
3464,growth,NOUN,16056
14240,⁣,NOUN,13235
9,health,NOUN,12942
40507,keepgoing,NOUN,11238
13480,comprehension,NOUN,10967
57,thing,NOUN,8323
1782,#mentalhealth #,NOUN,8214


## Collocation

In [17]:
# Compute the collocation table for one target word and save it to Excel.
# This is the most complicated computation; the output will look a little clunky and,
# as you can see, some minor errors still need fixing.
cl_df = collocator_main(
    ("life", "NOUN"),  # target; you can change this, but pay attention to the format!
    DOCBINS,
    nlp,
    total,
    window_size = 3,  # you can change this, we will discuss this in class (window size = 2 -> just next to the one we have)
    remove_stopwords = True,  # option for advanced use
)

cl_df.to_excel("../../output_data/words_collocation.xlsx", index=True, engine = "xlsxwriter")

Error encountered, printing contingency table values...
 A: 3  B: 23489  C: 0  D: 1582920
Error encountered, printing contingency table values...
 A: 4  B: 23488  C: 0  D: 1582920
Error encountered, printing contingency table values...
 A: 3  B: 23489  C: 0  D: 1582920
Error encountered, printing contingency table values...
 A: 3  B: 23489  C: 0  D: 1582920
Error encountered, printing contingency table values...
 A: 3  B: 0  C: 23489  D: 1582920
Error encountered, printing contingency table values...
 A: 3  B: 0  C: 23489  D: 1582920
Error encountered, printing contingency table values...
 A: 4  B: 23488  C: 0  D: 1582920
Error encountered, printing contingency table values...
 A: 25  B: 23467  C: -7  D: 1582927
Error encountered, printing contingency table values...
 A: 5  B: 0  C: 23487  D: 1582920
Error encountered, printing contingency table values...
 A: 6  B: 0  C: 23486  D: 1582920
Error encountered, printing contingency table values...
 A: 6  B: 23486  C: 0  D: 1582920
Error en

## Concordance

In [18]:
# Collect concordance hits for one target word across all docbins.
all_hits = concordancer(
    DOCBINS,
    ("comprehension","NOUN"),  # target word
    5,  # window size: how far on either side of the hit to look
    nlp,
    sample_size = 20,
    label = "comprehension|NOUN",
)

#### Observe: apparently most keywords are only used as hashtags

## Keyness

In [36]:
# Keyness compares a subset of the posts against the whole corpus, so we first need a
# frequency distribution for that subset.
# NOTE(review): DOCBIN_SIZE is None unless the processing cell ran after the parameters
# cell (or the value was filled in by hand) - confirm it is set before running this.

keyness_fdist = sliced_docbin_word_counter(
    DOCBINS,
    df,
    nlp,
    slice_value = "healthanxietycoach",  # the value we slice on; here, the username we want
    slice_variable = "author",  # the column of our data table in which to find the value above
    remove_stopwords = True,
    docbin_size = DOCBIN_SIZE,
)

Counting words in docbins...
../../intermediate_data/anxiety_docbins/anxiety_docbin_0.db
../../intermediate_data/anxiety_docbins/anxiety_docbin_1.db
../../intermediate_data/anxiety_docbins/anxiety_docbin_2.db
../../intermediate_data/anxiety_docbins/anxiety_docbin_3.db
../../intermediate_data/anxiety_docbins/anxiety_docbin_4.db
../../intermediate_data/anxiety_docbins/anxiety_docbin_5.db
../../intermediate_data/anxiety_docbins/anxiety_docbin_6.db
../../intermediate_data/anxiety_docbins/anxiety_docbin_7.db
../../intermediate_data/anxiety_docbins/anxiety_docbin_8.db
../../intermediate_data/anxiety_docbins/anxiety_docbin_9.db
../../intermediate_data/anxiety_docbins/anxiety_docbin_10.db
../../intermediate_data/anxiety_docbins/anxiety_docbin_11.db
../../intermediate_data/anxiety_docbins/anxiety_docbin_12.db
../../intermediate_data/anxiety_docbins/anxiety_docbin_13.db
../../intermediate_data/anxiety_docbins/anxiety_docbin_14.db
../../intermediate_data/anxiety_docbins/anxiety_docbin_15.db
../..

In [37]:
# Then it is just a matter of statistics! We will use a Chi-Squared statistic and the PDIFF statistic proposed by Gabrielatos (2018)

# Each result gets its own name: previously both were assigned to kn_df, so the
# Chi-Squared table was overwritten straight away and only reachable through the Excel file.
kn_chisq_df = keyness_chi_sq(keyness_fdist, total, savename = "../../output_data/keywords_chisq.xlsx")
# kn_df still holds the PDIFF table, as before.
kn_df = keyness_pdiff(keyness_fdist, total, savename = "../../output_data/keywords_pdiff.xlsx")

In [None]:
# Show the 20 rows with the highest pdiff value
kn_df.sort_values(by = "pdiff", ascending = False).head(20)

## Hashtag and Social Media Noise

In [38]:
# Count hashtags over the raw documents; this takes care of everything and makes an Excel spreadsheet.
ht_fdist = hashtag_counter(
    documents,
    savename = "../../output_data/hashtags.xlsx",
)

Hashtag frequency distribution saved!
