## Import libraries

In [2]:
import os
import pandas as pd
from drs_corpora import * #import helper functions
nlp = spacy.load("en_core_web_sm") # load the spaCy model

## Set parameters

In [3]:
# Set some parameters
PROJECT = 'anxiety'
DOCBIN_SIZE = None
SAVEDIR = f"../../intermediate_data/{PROJECT}_docbins"
DOCBIN_FILENAME_PATTERN = SAVEDIR + "/" + PROJECT

# Create a folder for the docbins of your project within intermediate_data if not exists
try:
    os.mkdir(SAVEDIR)
    print("Directory created. Path to docbins folder:", SAVEDIR)
except FileExistsError:
    print("Path to docbins folder:", SAVEDIR)

Path to docbins folder: ../../intermediate_data/anxiety_docbins


## Read Raw Data and create professional vs non-professional dfs

In [4]:
# Specify the path to the folder containing CSV files
input_path = '../../input_data/'

# Create a DataFrame with the accounts and the groups they belong to (professional, experience)
account_groups_df = pd.read_json(f"{input_path}account_groups.json", typ='series').reset_index().rename(columns={"index": "account", 0: "category"})

# Initialize an empty list to store the DataFrames
dfs = []

# Loop through each file in the folder
for filename in os.listdir(input_path):
    if filename.endswith(".csv"):
        # Construct the full path to the CSV file
        file_path = os.path.join(input_path, filename)
        
        # Read the CSV file into a DataFrame
        df = pd.read_csv(file_path)
        
        # Append the DataFrame to the list
        dfs.append(df)

# Concatenate all DataFrames into a single DataFrame
df = pd.concat(dfs, ignore_index=True)

# Filter non-null posts
df = df[df["body"].isna() == False]

# Save to Excel
#df.to_excel(f"{input_path}anxiety.xlsx")

# Merge to accounts info
df = df.merge(account_groups_df, left_on = 'author', right_on = 'account')

# Create and save 2 dfs
professional_df = df[df['category'] == 'professional']
#professional_df.to_excel(f"{input_path}anxiety_professional.xlsx")

experience_df = df[df['category'] == 'experience']
#experience_df.to_excel(f"{input_path}anxiety_experience.xlsx")

In [10]:
len(experience_df)

13588

## Pick a dataset

In [113]:
#df = experience_df
#df = professional_df
documents = list(df["body"])

## Tokenization, Lemmatization, Named Entity Recognition, and POS Tagging

In [114]:
%%time
nlp.add_pipe("merge_entities") # this makes the tokens cleaner later

#we split up all the documents into a bunch of ".db" files, or "docbin" files that hold our text data.

N_PROCESS = 10 # you can change this depending on how many cores your CPU has (eg. a Mac M1 has 8 cores, so you can use up to 7 here)
BATCH_SIZE = 100 # this depends on how much RAM you have. If this process hangs or crashes, you may want to reduce batch size (how many docs each core will process in one chunk)
DOCBIN_SIZE = N_PROCESS * BATCH_SIZE

print("docbin size for this project: ", str(DOCBIN_SIZE), "\nPlease take note of the docbin size if you want to come back to your project and not process the data again.")

for i, chunk in enumerate(chunker(documents, DOCBIN_SIZE)): # chopping our dataset into chunks. We don't need this in our toy example, but we do for large datsets; change from 100 if you want
    doc_bin = DocBin(store_user_data = True) # create a docbin for our chunk
    for doc in nlp.pipe(chunk, n_process = N_PROCESS, batch_size = BATCH_SIZE): # process our documents, you can play with n_process and batch_size depending on your CPU and RAM
        doc_bin.add(doc) # save the document to our docbin
    chunk_name = DOCBIN_FILENAME_PATTERN + "_docbin_" + str(i) + ".db" # make a nice filename for each chunk
    print("Saving chunk as: ", chunk_name) # display progress
    doc_bin.to_disk(chunk_name) # save docbin for chunk to disk

docbin size for this project:  1000 
Please take note of the docbin size if you want to come back to your project and not process the data again.
Saving chunk as:  ../../intermediate_data/anxiety_docbins/anxiety_docbin_0.db
Saving chunk as:  ../../intermediate_data/anxiety_docbins/anxiety_docbin_1.db
Saving chunk as:  ../../intermediate_data/anxiety_docbins/anxiety_docbin_2.db
Saving chunk as:  ../../intermediate_data/anxiety_docbins/anxiety_docbin_3.db
Saving chunk as:  ../../intermediate_data/anxiety_docbins/anxiety_docbin_4.db
Saving chunk as:  ../../intermediate_data/anxiety_docbins/anxiety_docbin_5.db
Saving chunk as:  ../../intermediate_data/anxiety_docbins/anxiety_docbin_6.db
Saving chunk as:  ../../intermediate_data/anxiety_docbins/anxiety_docbin_7.db
Saving chunk as:  ../../intermediate_data/anxiety_docbins/anxiety_docbin_8.db
Saving chunk as:  ../../intermediate_data/anxiety_docbins/anxiety_docbin_9.db
Saving chunk as:  ../../intermediate_data/anxiety_docbins/anxiety_docbin_1

## Frequency

In [30]:
# before we can do anything, we need to get a list of all the names of the docbins, in order, thats what happens here

docbin_folder = SAVEDIR + "/*.db" # you might have to change this if you use this notebook for a different folder
docbins = [docbin for docbin in glob.iglob(docbin_folder)]
DOCBINS = list()
for i in range(0, len(docbins), 1):
    db_path = DOCBIN_FILENAME_PATTERN + "_docbin_" + str(i) + ".db" # here the naming pattern of the docbins is hard coded, so you may have to change this if you apply it to another project
    DOCBINS.append(db_path)

In [31]:
# Let's do the counting!

subtotals = [docbin_counter(docbin, nlp) for docbin in DOCBINS] # we will apply a counting function to each docbin here
total = Counter() # we set up a blank counter which will consolidate all the docbin-level totals
for subtotal in subtotals: # this loop does the counting
    total.update(subtotal)
fdist = fdist2table(total, savename = "../../output_data/words_frequency_experience.xlsx") # save it to excel, you can make the filename whatever you want

Frequency distribution saved!


In [22]:
fdist.query("word == 'stress'")

Unnamed: 0,word,label,count
11,stress,NOUN,1072
1389,stress,VERB,247
26236,stress,ADJ,1


In [None]:
# show top 20 NOUN, VERB, ADJ in a table
fdist.query("label == 'VERB'").sort_values("count", ascending = False).head(20)

## Compare professionals vs non-professional Frequency

In [14]:
prof_count_df = pd.read_excel("../../output_data/words_frequency_professional.xlsx")[['word', 'label', 'count']]\
.rename(columns = {'count': 'count_prof'})
exp_count_df = pd.read_excel("../../output_data/words_frequency_experience.xlsx")[['word', 'label', 'count']]\
.rename(columns = {'count': 'count_amat'})

tot_count_df = prof_count_df.merge(exp_count_df, left_on = ['word', 'label'], right_on = ['word', 'label'])

In [12]:
professional_words = ['illness',
                 'treatement',
                 'disease',
                 'analysis',
                 'cure',
                 'hospitalization',
                 'medication',
                 'medicine',
                 'operation',
                 'prescription',
                 'healing',
                 'therapy',
                 'knowledge',
                 'learn',
                 'psichologist'     
                ]

experience_words = ['life',
                 'growth',
                 'love',
                 'comprehension',
                 'judgement',
                 'inspiration',
                 'gratitude',
                 'story',
                 'habit'
                 'day',
                 'selfcare',
                 'happiness'
                ]
                 
                 

In [17]:
tot_count_df[tot_count_df.word.isin(experience_words)].sort_values(by=['word', 'label'])

Unnamed: 0,word,label,count_prof,count_amat
2778,comprehension,NOUN,7,10960
2624,gratitude,NOUN,156,6450
4081,gratitude,VERB,48,511
2665,growth,NOUN,173,15883
2628,happiness,NOUN,256,6796
294,inspiration,NOUN,434,6106
2027,judgement,NOUN,128,16372
4993,judgement,PROPN,3,2
4575,judgement,VERB,4,10
530,life,NOUN,3826,19649


In [76]:
exp_count_df.sort_values(by ='count_exp',ascending = False)

Unnamed: 0,word,label,count_exp
63,life,NOUN,19649
5,anxiety,NOUN,17477
11538,judgement,NOUN,16372
2640,growth,NOUN,15883
12694,⁣,NOUN,13100
...,...,...,...
17303,𝐬𝐨𝐦𝐞,PROPN,1
17304,⁣\nyour gut produce up to,VERB,1
17305,olympic,ADJ,1
17306,Googling,PROPN,1


## Collocation

In [None]:
cl_df = collocator_main(
    
    ("neurotransmitter", "NOUN"),   ## you can change this, but pay attention to the format!
    DOCBINS, 
    nlp, 
    total, 
    window_size = 3,   ## you can change this, we will discuss this in class (window size = 2 -> just next to the one we have)
    remove_stopwords = True ## option for advanced use
    
)

# this will look a little clunky, and as you can see, some minor errors need fixing; this is the most complicated computation.

cl_df.to_excel("../../output_data/words_collocation.xlsx", index=True, engine = "xlsxwriter")

## Concordance

In [None]:
all_hits = concordancer(
    DOCBINS,
    ("anxious","ADJ"),
    5, # this is how on either side to look for (window size)
    nlp,
    sample_size = 20,
    label = "anxious|ADJ"
)

#### Observe: apparently most keywords are only used as hashtgas

## Keyness

In [None]:
# To compute keyness, first we need a frequency distribution of a subset of our posts to compare with the total.

keyness_fdist = sliced_docbin_word_counter(
    
    DOCBINS,
    df,
    nlp,
    slice_value = "healthanxietycoach", # we are slicing on a value, so here we can put in the username we want
    slice_variable = "author", # this tells us which column of our data table to find the value above
    remove_stopwords = True,
    docbin_size = DOCBIN_SIZE
    
)

In [None]:
# Then it is just a matter of statistics! We will use a Chi-Squared statistic and the PDIFF statistic proposed by Gabrielatos (2018)

kn_df = keyness_chi_sq(keyness_fdist, total, savename = "../../output_data/keywords_chisq.xlsx")
kn_df = keyness_pdiff(keyness_fdist, total, savename = "../../output_data/keywords_pdiff.xlsx")

In [None]:
kn_df.sort_values("pdiff", ascending=False).head(20)

## Hashtag and Social Media Noise

In [None]:
ht_fdist = hashtag_counter(documents, savename = "../../output_data/hashtags.xlsx") # this takes care of everything and makes an excel spreadsheet