In [None]:
# Mount Google Drive into the Colab runtime at /content/drive/ so the data
# files referenced through `my_path` can be read and written.
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
# change path HERE to reflect your folder structure
# Base data folder on the mounted Drive. File paths in this notebook are built
# by plain string concatenation (my_path + 'name.csv'), so keep the trailing
# slash.
my_path = '/content/drive/MyDrive/suicide-project/data/'

In [None]:
# Load the pre-processed corpus into a DataFrame.

import pandas as pd
from ast import literal_eval

# Only these columns are read from the CSV.
corpus_columns = ['date', 'post_id', 'without_stopwords_body']

# The token column is run through literal_eval while reading, so each cell
# holds the evaluated Python literal rather than its text representation.
df = pd.read_csv(
    my_path + 'PROCESSED_TEXT_CORPUS.csv',
    usecols=corpus_columns,
    converters={'without_stopwords_body': literal_eval},
)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1337982 entries, 0 to 1337981
Data columns (total 3 columns):
 #   Column                  Non-Null Count    Dtype 
---  ------                  --------------    ----- 
 0   post_id                 1337982 non-null  int64 
 1   date                    1337982 non-null  object
 2   without_stopwords_body  1337982 non-null  object
dtypes: int64(1), object(2)
memory usage: 30.6+ MB


In [None]:
# Preview the first two rows of the loaded frame.
df.head(2)

Unnamed: 0,post_id,date,without_stopwords_body
0,1742806,"Oct 6, 2022","[one, nice, thing]"
1,1742807,"Oct 6, 2022","[care, well, beings, others]"


In [None]:
# Lemmatize every token with NLTK's WordNet lemmatizer and store the result
# as one space-separated string per post.
import nltk
from nltk.stem import WordNetLemmatizer

# Download the WordNet data and the Open Multilingual WordNet package.
for resource in ('wordnet', 'omw-1.4'):
    nltk.download(resource)

wordnet_lem = WordNetLemmatizer()


def lemmatize_tokens(tokens):
    """Lemmatize each token in `tokens` and join the results with single spaces."""
    return ' '.join(wordnet_lem.lemmatize(token) for token in tokens)


df['lemmatized_body'] = df['without_stopwords_body'].apply(lemmatize_tokens)
df.head(2)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


Unnamed: 0,post_id,date,without_stopwords_body,lemmatized_body
0,1742806,"Oct 6, 2022","[one, nice, thing]",one nice thing
1,1742807,"Oct 6, 2022","[care, well, beings, others]",care well being others


In [None]:
# Set up NLTK's VADER sentiment analyzer.
import nltk
from nltk.sentiment import SentimentIntensityAnalyzer

# Download the VADER lexicon before the analyzer is constructed.
nltk.download('vader_lexicon')
analyzer = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to /root/nltk_data...


In [None]:
# Score every lemmatized post; each cell of 'polarity' holds the value returned
# by analyzer.polarity_scores for that post's text.
df['polarity'] = df['lemmatized_body'].apply(analyzer.polarity_scores)

In [None]:
def label_sentiment(compound):
    """Map a compound score to a label.

    Returns 'positive' when the score is > 0, 'neutral' when it is exactly 0,
    and 'negative' for everything else.
    """
    if compound > 0:
        return 'positive'
    if compound == 0:
        return 'neutral'
    return 'negative'


# Expand the per-row score dicts into one column per key (neg, neu, pos,
# compound). Building the frame from the list of dicts in a single call avoids
# creating a separate pd.Series for every row, which is what
# `.apply(pd.Series)` does and is very slow on a frame of this size.
# Reusing df.index keeps the new columns aligned with the existing rows.
scores = pd.DataFrame(df['polarity'].tolist(), index=df.index)
df = pd.concat([df.drop(columns='polarity'), scores], axis=1)

# Derive a categorical label from the compound score.
df['sentiment'] = df['compound'].apply(label_sentiment)
df.head(3)

Unnamed: 0,post_id,date,without_stopwords_body,lemmatized_body,neg,neu,pos,compound,sentiment
0,1742806,"Oct 6, 2022","[one, nice, thing]",one nice thing,0.0,0.417,0.583,0.4215,positive
1,1742807,"Oct 6, 2022","[care, well, beings, others]",care well being others,0.0,0.274,0.726,0.6486,positive
2,1742808,"Oct 6, 2022","[arent, karens]",arent karen,0.0,1.0,0.0,0.0,neutral


In [None]:
# Keep only the compound score and its label; the raw token list and the
# individual pos/neu/neg scores are removed from the frame.
unused_columns = ['without_stopwords_body', 'pos', 'neu', 'neg']
df = df.drop(columns=unused_columns)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1337982 entries, 0 to 1337981
Data columns (total 5 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   post_id          1337982 non-null  int64  
 1   date             1337982 non-null  object 
 2   lemmatized_body  1337982 non-null  object 
 3   compound         1337982 non-null  float64
 4   sentiment        1337982 non-null  object 
dtypes: float64(1), int64(1), object(3)
memory usage: 51.0+ MB


In [None]:
# Write the per-post sentiment table as a comma-separated file with a header
# row and without the index column.
sentiment_csv = my_path + 'sentiment.csv'
df.to_csv(sentiment_csv, index=False)

In [None]:
# Parse the date strings (e.g. "Oct 6, 2022") once, then derive the year and
# the zero-padded month as strings through the .dt accessor. This replaces
# converting every timestamp to str twice and slicing it by position, and it
# no longer needs a temporary 'date_n' column on the frame.
# Note: a missing date (NaT) yields NaN here instead of the strings
# 'NaT' / '' that positional slicing would produce.
parsed_dates = pd.to_datetime(df['date'])
df['year'] = parsed_dates.dt.strftime('%Y')    # e.g. '2022'
df['month'] = parsed_dates.dt.strftime('%m')   # e.g. '03', '10'

In [None]:
# Inspect the frame's columns and dtypes after the year/month columns were added.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1337982 entries, 0 to 1337981
Data columns (total 7 columns):
 #   Column           Non-Null Count    Dtype  
---  ------           --------------    -----  
 0   post_id          1337982 non-null  int64  
 1   date             1337982 non-null  object 
 2   lemmatized_body  1337982 non-null  object 
 3   compound         1337982 non-null  float64
 4   sentiment        1337982 non-null  object 
 5   year             1337982 non-null  object 
 6   month            1337982 non-null  object 
dtypes: float64(1), int64(1), object(5)
memory usage: 71.5+ MB


In [None]:
# Remove the per-post columns that the monthly sentiment counts do not use.
per_post_columns = ['post_id', 'date', 'lemmatized_body', 'compound']
df = df.drop(columns=per_post_columns)

In [None]:
# Count the posts for every (year, month, sentiment) combination.
# NOTE: reset_index() is called without a name, so the count column is
# labelled 0 (the integer); that label is kept as-is here.
group_keys = ['year', 'month', 'sentiment']
group_sizes = df.groupby(group_keys).size()
dfcounts = group_sizes.reset_index()
dfcounts

Unnamed: 0,year,month,sentiment,0
0,2018,03,negative,106
1,2018,03,neutral,20
2,2018,03,positive,117
3,2018,04,negative,654
4,2018,04,neutral,164
...,...,...,...,...
163,2022,09,neutral,3759
164,2022,09,positive,13005
165,2022,10,negative,2313
166,2022,10,neutral,762


In [None]:
# Write the monthly sentiment counts as a comma-separated file with a header
# row and without the index column.
counts_csv = my_path + 'sentiment_all_month_year.csv'
dfcounts.to_csv(counts_csv, index=False)