In [None]:
REDDIT POST DEPRESSION SENTIMENT ANALYSIS

* TFIDF (information extraction)
* VADER
* TextBlob
* HuggingFace

import numpy as np #linear
import pandas as pd #data processing
from sklearn import preprocessing
import heapq #heap based priority queue implementattion
from collections import Counter #special contained datatypes, 
# Counter: Dictionary subclass for counting hashable objects. 
#It provides a convenient way to count the occurrences of elements in a collection.
import nltk
nltk.download('stopwords')
nltk.download('punkt')
from nltk.corpus import stopwords #access to large bodies of text for classification, etc
from nltk.tokenize import word_tokenize #word tokenizer
from nltk.tokenize.treebank import TreebankWordDetokenizer #for converting tokens back into text
#importing the dataset
# The CSV is looked up relative to the notebook's working directory.
# Later cells access two of its columns: df.clean_text (the post text) and
# df.is_depression (the label, compared against 1).
# NOTE(review): `os` is not referenced anywhere in the visible part of this notebook.
import os
df = pd.read_csv("depression_dataset_reddit_cleaned.csv")

Bag of Words representation of data: an unordered collection of words that keeps each word's frequency

1) tokenize (remove stop words first)
2) unique word lists (two: one for depressed and one for non-depressed posts)
3) word occurrences in the unique word lists
4) vectorize

# Remove English stop words from every post and store the result back in
# df.clean_text (the original text is overwritten; later cells re-read the CSV
# when they need it).
s_words = set(stopwords.words('english')) # a set makes the per-token membership test cheap
rows = df.shape[0]

detokenizer = TreebankWordDetokenizer() # one instance reused for every row

stripped_texts = []
for text in df["clean_text"]:
    # Tokenize, drop the stop words, then join the remaining tokens back into text.
    kept = [tok for tok in word_tokenize(text) if tok not in s_words]
    stripped_texts.append(detokenizer.detokenize(kept))

# Assign the whole column in one step. The previous per-row form
# `df.clean_text[i] = ...` is chained assignment: it triggers
# SettingWithCopyWarning and, with pandas Copy-on-Write enabled, never
# modifies df at all.
df["clean_text"] = stripped_texts

# Count token occurrences per class:
#   d_freq  -> posts whose is_depression label equals 1
#   nd_freq -> posts with any other label
d_freq = {}
nd_freq = {}
for row in range(rows):
    # The label is constant for a row, so pick the target dict once per row.
    bucket = d_freq if df.is_depression[row] == 1 else nd_freq
    for word in word_tokenize(df.clean_text[row]):
        bucket[word] = bucket.get(word, 0) + 1
            

# The 100 most frequent words of each class (keys ranked by their counts).
d_words = heapq.nlargest(100, d_freq, key=d_freq.get)
nd_words = heapq.nlargest(100, nd_freq, key=nd_freq.get)

# Pair every selected word with its count inside its own class.
d_most_f = {word: d_freq[word] for word in d_words}
nd_most_f = {word: nd_freq[word] for word in nd_words}

# Merge both tables; a word that appears in both gets the sum of its two counts.
# The iteration order of total_f is the feature order used by the cells below.
total_f = Counter(d_most_f) + Counter(nd_most_f)
#total_f

Create one array per row of the data, holding how often each selected word (from total_f) occurs in that row

# Term-frequency matrix as a list of lists: one inner list per post, one entry
# per word in total_f (in total_f's iteration order).
f_vec = []
for row in range(rows):
    # Count every token of the post once; a Counter yields 0 for absent words,
    # which matches the explicit "else 0" branch of a manual count.
    occurrences = Counter(word_tokenize(df.clean_text[row]))
    f_vec.append([occurrences[word] for word in total_f])

IDF: emphasize words that are rare across the entire corpus but occur frequently in specific documents

# Smoothed inverse document frequency for every word in total_f:
#   idf(t) = ln((1 + n) / (1 + df(t))) + 1
# where n is the number of posts and df(t) the number of posts containing t.

# Tokenize each post once (instead of once per word) and keep the tokens as a
# set, since only membership matters for the document frequency.
doc_tokens = [set(word_tokenize(df.clean_text[i])) for i in range(rows)]

inv_f = []
for x in total_f:
    # df(t): number of posts containing x. This has to accumulate over ALL
    # posts; resetting the counter inside the per-post loop would leave only
    # the last post's 0/1 value.
    data = sum(1 for tokens in doc_tokens if x in tokens)
    idf = np.log((1+rows)/(1+data)) + 1
    inv_f.append(idf)
    
# Formula used above (smoothed): idf(t) = ln((1 + n) / (1 + df(t))) + 1
# Alternative variant, not used here: idf(t) = log(n / (df(t) + 1))
# n = total number of rows
# df(t) = number of documents in the set that contain the term "t"

# Sanity check, printed in this order: number of selected words, number of
# term-frequency rows, number of IDF weights.
for collection in (total_f, f_vec, inv_f):
    print(len(collection))

# TF-IDF with normalization
tf = np.array(f_vec)     # one row per post, one column per word in total_f
idf_ = np.array(inv_f)   # one IDF weight per word in total_f

# Element-wise product: broadcasting applies each word's IDF weight to that
# word's column in every row.
tfidf = tf * idf_

# Rescale every feature (column) independently with a min-max scaler; fit()
# learns the per-column range and transform() applies it.
scale = preprocessing.MinMaxScaler()
tfidf = scale.fit(tfidf).transform(tfidf)
tfidf

#print(total_f)
#print(tf[0])
#print(df.clean_text[0])
#print(tfidf[:,1])

# Add one column per word in total_f to df. Column k of the tfidf matrix holds
# the values for the k-th word in total_f's iteration order.
for col_idx, word in enumerate(total_f):
    df.loc[:, word] = tfidf[:, col_idx]

# df now has its original columns plus one scaled TF-IDF column per word,
# each holding that word's value for every post in the dataset.
df

making new csv file

# Full frame (text, label and TF-IDF columns), written without the index.
df.to_csv("tfdif.csv", index=False)

#most frequent depressed and non depressed words
for words, path in (
    (d_words, "100_frequent_depressed.csv"),
    (nd_words, "100_frequent_n_depressed.csv"),
):
    pd.DataFrame(words).to_csv(path, index=False)

# df_1: the untouched dataset re-read from disk (df itself had its clean_text
# overwritten earlier in the notebook). df_2: the TF-IDF export written above.
df_1 = pd.read_csv('depression_dataset_reddit_cleaned.csv')
df_2 = pd.read_csv('tfdif.csv')
df_1.head()

#use VADER for sentiment analysis on the original data

pip install vaderSentiment

#lower score = negative sentiment

from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
sentiment = SentimentIntensityAnalyzer()
# Spot-check one post (row label 50) before scoring a whole split.
score = sentiment.polarity_scores(df_1.clean_text[50])
compound = score['compound']  # compound = overall sentiment polarity
print(compound)

#split the dataset into train and test datasets

from sklearn.model_selection import train_test_split as tts

#train is the data (posts) to train on; it is taken from df_1, i.e. the original clean_text with stop words intact, not the stop-word-stripped text saved to tfdif.csv

# Hold out 33% of the posts; the fixed random_state makes the split repeatable.
train, test = tts(df_1, random_state=42, test_size=0.33)
# Renumber the training rows 0..n-1 so label-based lookups such as
# train.clean_text[i] cannot raise a KeyError.
train = train.reset_index(drop=True)
train.index

#test.drop(columns=['is_depression'], inplace=True)
#test.head()

# Compound VADER polarity score of every training post, in row order.
num_rows = train.shape[0]
score_vec = [
    sentiment.polarity_scores(train.clean_text[i])['compound']
    for i in range(num_rows)
]

score_vec[:5]

# Store the scores next to the posts they belong to.
train.loc[:, 'sentiment'] = score_vec
train.head()

# Mean and standard deviation of the VADER sentiment per is_depression label
# (a minimum is not computed here).
vader_by_label = train.groupby(['is_depression'], as_index=False)['sentiment']

avg = vader_by_label.mean()
print("average")
avg

standard = vader_by_label.std()
print("standard deviation")
standard

#the average sentiment is lower for 'depressed' posts
#the std is greater, which means there is more variance in sentiment among the 'depressed' posts
#what could this mean?

Using TextBlob

!pip install -U textblob
!python -m textblob.download_corpora

from textblob import TextBlob as tb

#split the dataset again

# Hold out 33% of the posts for the TextBlob experiment.
# NOTE(review): random_state is 39 here but 42 for the VADER split, so the two
# tools are scored on different training subsets - confirm this is intended.
tb_train, tb_test = tts(df_1, random_state=39, test_size=0.33)
# Renumber the training rows 0..n-1 so label-based lookups such as
# tb_train.clean_text[i] cannot raise a KeyError.
tb_train = tb_train.reset_index(drop=True)
tb_train.index

tb_train.head()

# Spot-check one post (row label 1500); element 0 of the sentiment result is
# the value stored as the post's sentiment score in the following cells.
sc = tb(tb_train.clean_text[1500])
print(sc.sentiment[0])

# First element of the TextBlob sentiment result for every training post, in row order.
num_rows_2 = tb_train.shape[0]
score_vec_2 = [
    tb(tb_train.clean_text[i]).sentiment[0]
    for i in range(num_rows_2)
]
score_vec_2[:5]

# Store the scores next to the posts they belong to.
tb_train.loc[:,'sentiment'] = score_vec_2
tb_train

# Mean and standard deviation of the TextBlob sentiment per is_depression label.
tb_by_label = tb_train.groupby(['is_depression'], as_index=False)['sentiment']

average = tb_by_label.mean()
print("average")
average

stand = tb_by_label.std()
print("standard deviation")
stand

#TextBlob gives us a higher sentiment for the depressed posts
#than for the non-depressed posts.
#this probably means VADER is more accurate in this case

