## Import libraries

In [7]:
import os
import pandas as pd
from drs_corpora import * #import helper functions
nlp = spacy.load("en_core_web_sm") # load the spaCy model

## Set parameters

In [2]:
INNTERMEDIATE_DATA = "intermediate_data" #we split up all the documents into a bunch of ".db" files, or "docbin" files that hold our text data.
DOCBIN_SIZE = None
SAVEDIR = "../" + INNTERMEDIATE_DATA

try:
    os.mkdir(SAVEDIR)
    print("Directory created. Path to data folder:", SAVEDIR)
except FileExistsError:
    print("Path to data folder:", SAVEDIR)
DOCBIN_FILENAME_PATTERN = SAVEDIR + "/" + INNTERMEDIATE_DATA + "_docbin"

Path to data folder: ../intermediate_data


## Read Raw Data

In [20]:
{profiles_category_dict = 

{"anxiety.positive": "experience",
 "anxiety_fightermum": "experience",
 "your.anxious.therapist": "professional"
,"drkirren": "professional"
,"drjulie":	"professional"
,"the.anxious.truth": "professional
,"cherellethinks": "professional"
,"theanxietyhealer": "professional"
,"myeasytherapy": "professional"
,"dearmyanxiety": "experience"
,"anxietyjosh": "experience"
,"healyournervoussystem": "professional"
,"thehealingtherapist": "professional"
,"anxiety_fitness": "experience"
,"healthanxietycoach": "professional"
,"ahealthypush": "professional"
,"_peacefromwithin": "professional"
,"honestlyholistic": "experience"
,"health_anxiety": "experience"
,"zowhy_coaching": "experience"
,"becdiekman": "experience"
}
}

SyntaxError: invalid syntax (2385565993.py, line 1)

In [15]:
# Specify the path to the folder containing CSV files
folder_path = '../input_data/'
json_name = 'profiles_category.json'

df = pd.read_json(folder_path + json_name, lines=True)#.T.reset_index()


ValueError: Expected object or value

In [10]:
# Specify the path to the folder containing CSV files
folder_path = '../input_data/'

# Initialize an empty list to store the DataFrames
dfs = []

# Loop through each file in the folder
for filename in os.listdir(folder_path):
    if filename.endswith(".csv"):
        # Construct the full path to the CSV file
        file_path = os.path.join(folder_path, filename)
        
        # Read the CSV file into a DataFrame
        df = pd.read_csv(file_path)
        
        # Append the DataFrame to the list
        dfs.append(df)

# Concatenate all DataFrames into a single DataFrame
df = pd.concat(dfs, ignore_index=True)

print(len(df))
df = df[df["body"].isna() == False]
documents = list(df["body"])
print(len(df))

26791
26692


## Tokenization, Lemmatization, Named Entity Recognition, and POS Tagging

In [4]:
%%time
nlp.add_pipe("merge_entities") # this makes the tokens cleaner later

N_PROCESS = 10 # you can change this depending on how many cores your CPU has (eg. a Mac M1 has 8 cores, so you can use up to 7 here)
BATCH_SIZE = 100 # this depends on how much RAM you have. If this process hangs or crashes, you may want to reduce batch size (how many docs each core will process in one chunk)
DOCBIN_SIZE = N_PROCESS * BATCH_SIZE

print("docbin size for this project: ", str(DOCBIN_SIZE), "\nPlease take note of the docbin size if you want to come back to your project and not process the data again.")

for i, chunk in enumerate(chunker(documents, DOCBIN_SIZE)): # chopping our dataset into chunks. We don't need this in our toy example, but we do for large datsets; change from 100 if you want
    doc_bin = DocBin(store_user_data = True) # create a docbin for our chunk
    for doc in nlp.pipe(chunk, n_process = N_PROCESS, batch_size = BATCH_SIZE): # process our documents, you can play with n_process and batch_size depending on your CPU and RAM
        doc_bin.add(doc) # save the document to our docbin
    chunk_name = DOCBIN_FILENAME_PATTERN + "_docbin_" + str(i) + ".db" # make a nice filename for each chunk
    print("Saving chunk as: ", chunk_name) # display progress
    doc_bin.to_disk(chunk_name) # save docbin for chunk to disk

docbin size for this project:  1000 
Please take note of the docbin size if you want to come back to your project and not process the data again.
Saving chunk as:  ../intermediate_data/intermediate_data_docbin_docbin_0.db
Saving chunk as:  ../intermediate_data/intermediate_data_docbin_docbin_1.db
Saving chunk as:  ../intermediate_data/intermediate_data_docbin_docbin_2.db
Saving chunk as:  ../intermediate_data/intermediate_data_docbin_docbin_3.db
Saving chunk as:  ../intermediate_data/intermediate_data_docbin_docbin_4.db
CPU times: user 13.6 s, sys: 2.58 s, total: 16.2 s
Wall time: 1min 14s


## Frequency

In [5]:
# before we can do anything, we need to get a list of all the names of the docbins, in order, thats what happens here

docbin_folder = SAVEDIR + "/*.db" # you might have to change this if you use this notebook for a different folder
docbins = [docbin for docbin in glob.iglob(docbin_folder)]
DOCBINS = list()
for i in range(0, len(docbins), 1):
    db_path = DOCBIN_FILENAME_PATTERN + "_docbin_" + str(i) + ".db" # here the naming pattern of the docbins is hard coded, so you may have to change this if you apply it to another project
    DOCBINS.append(db_path)

In [6]:
# Let's do the counting!

subtotals = [docbin_counter(docbin, nlp) for docbin in DOCBINS] # we will apply a counting function to each docbin here
total = Counter() # we set up a blank counter which will consolidate all the docbin-level totals
for subtotal in subtotals: # this loop does the counting
    total.update(subtotal)
fdist = fdist2table(total, savename = "../output_data/words_frequency.xlsx") # save it to excel, you can make the filename whatever you want

Frequency distribution saved!


In [23]:
# show top 20 NOUN, VERB, ADJ in a table
fdist.query("label == 'ADJ'").sort_values("count", ascending = False).head(20)

Unnamed: 0,word,label,count
19,#grateful #,ADJ,6196
93,✨,ADJ,283
1505,thinkpositive,ADJ,217
2227,authentic,ADJ,201
1600,tiredmum,ADJ,134
1738,controlwhatyoucan,ADJ,112
1826,showup,ADJ,107
86,great,ADJ,103
2169,ichoose,ADJ,101
351,good,ADJ,100


## Collocation

In [8]:
cl_df = collocator_main(
    
    ("life", "NOUN"),   ## you can change this, but pay attention to the format!
    DOCBINS, 
    nlp, 
    total, 
    window_size = 3,   ## you can change this, we will discuss this in class (window size = 2 -> just next to the one we have)
    remove_stopwords = True ## option for advanced use
    
)

# this will look a little clunky, and as you can see, some minor errors need fixing; this is the most complicated computation.

cl_df.to_excel("../output_data/words_collcation.xlsx", index=True, engine = "xlsxwriter")

Error encountered, printing contingency table values...
 A: 5128  B: 12013  C: 0  D: 257300
Error encountered, printing contingency table values...
 A: 2274  B: 14867  C: 0  D: 257300
Error encountered, printing contingency table values...
 A: 13  B: 0  C: 17128  D: 257300
Error encountered, printing contingency table values...
 A: 15  B: 0  C: 17126  D: 257300
Error encountered, printing contingency table values...
 A: 251  B: 16890  C: 0  D: 257300
Error encountered, printing contingency table values...
 A: 13  B: 0  C: 17128  D: 257300
Error encountered, printing contingency table values...
 A: 33  B: 17108  C: 0  D: 257300
Error encountered, printing contingency table values...
 A: 8  B: 0  C: 17133  D: 257300


## Concordance

In [24]:
all_hits = concordancer(
    DOCBINS,
    ("comprehension","NOUN"),
    5, # this is how on either side to look for (window size)
    nlp,
    sample_size = 20,
    label = "comprehension|NOUN"
)

TypeError: bad operand type for unary +: 'str'

#### Observe: apparently most keywords are only used as hashtgas

## Keyness

In [None]:
# To compute keyness, first we need a frequency distribution of a subset of our posts to compare with the total.

keyness_fdist = sliced_docbin_word_counter(
    
    DOCBINS,
    df,
    nlp,
    slice_value = "anxiety_fightermum", # we are slicing on a value, so here we can put in the username we want
    slice_variable = "author", # this tells us which column of our data table to find the value above
    remove_stopwords = True,
    docbin_size = DOCBIN_SIZE
    
)

In [None]:
# Then it is just a matter of statistics! We will use a Chi-Squared statistic and the PDIFF statistic proposed by Gabrielatos (2018)

kn_df = keyness_chi_sq(keyness_fdist, total, savename = "../output_data/keywords_chisq.xlsx")
kn_df = keyness_pdiff(keyness_fdist, total, savename = "../output_data/keywords_pdiff.xlsx")

In [None]:
kn_df.sort_values("pdiff", ascending=False).head(20)

## Hashtag and Social Media Noise

In [None]:
ht_fdist = hashtag_counter(documents, savename = "../output_data/hashtags.xlsx") # this takes care of everything and makes an excel spreadsheet