In [None]:
# mount drive to access data
from google.colab import drive
drive.mount('/content/drive/')

Mounted at /content/drive/


In [None]:
# change path HERE to reflect your folder structure
my_path = '/content/drive/MyDrive/suicide-project/data/'

In [None]:
# create dataframe from csv data file

import pandas as pd
from ast import literal_eval
df = pd.read_csv(my_path + 'PROCESSED_TEXT_CORPUS.csv', usecols = ['post_id','date','post_text','without_stopwords_body'],
                 converters={'without_stopwords_body': literal_eval})
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1337982 entries, 0 to 1337981
Data columns (total 4 columns):
 #   Column                  Non-Null Count    Dtype 
---  ------                  --------------    ----- 
 0   post_id                 1337982 non-null  int64 
 1   post_text               1329200 non-null  object
 2   date                    1337982 non-null  object
 3   without_stopwords_body  1337982 non-null  object
dtypes: int64(1), object(3)
memory usage: 40.8+ MB


In [None]:
df.head(2)

Unnamed: 0,post_id,post_text,date,without_stopwords_body
0,1742806,just one nice thing.,"Oct 6, 2022","[one, nice, thing]"
1,1742807,they care for the well beings of others.,"Oct 6, 2022","[care, well, beings, others]"


In [None]:
# lemmatization using nltk/wordnet
import nltk
nltk.download('wordnet')
nltk.download('omw-1.4')
from nltk.stem import WordNetLemmatizer
wordnet_lem = WordNetLemmatizer()
df['lemmatized_body'] = df['without_stopwords_body'].apply(lambda x: ' '.join([wordnet_lem.lemmatize(word) for word in x]))
df.head(2)

[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data] Downloading package omw-1.4 to /root/nltk_data...


Unnamed: 0,post_id,post_text,date,without_stopwords_body,lemmatized_body
0,1742806,just one nice thing.,"Oct 6, 2022","[one, nice, thing]",one nice thing
1,1742807,they care for the well beings of others.,"Oct 6, 2022","[care, well, beings, others]",care well being others


In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1337982 entries, 0 to 1337981
Data columns (total 5 columns):
 #   Column                  Non-Null Count    Dtype 
---  ------                  --------------    ----- 
 0   post_id                 1337982 non-null  int64 
 1   post_text               1329200 non-null  object
 2   date                    1337982 non-null  object
 3   without_stopwords_body  1337982 non-null  object
 4   lemmatized_body         1337982 non-null  object
dtypes: int64(1), object(4)
memory usage: 51.0+ MB


In [None]:
df.drop(['without_stopwords_body'], axis=1)
df.info()
del df['without_stopwords_body']
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1337982 entries, 0 to 1337981
Data columns (total 5 columns):
 #   Column                  Non-Null Count    Dtype 
---  ------                  --------------    ----- 
 0   post_id                 1337982 non-null  int64 
 1   post_text               1329200 non-null  object
 2   date                    1337982 non-null  object
 3   without_stopwords_body  1337982 non-null  object
 4   lemmatized_body         1337982 non-null  object
dtypes: int64(1), object(4)
memory usage: 51.0+ MB
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1337982 entries, 0 to 1337981
Data columns (total 4 columns):
 #   Column           Non-Null Count    Dtype 
---  ------           --------------    ----- 
 0   post_id          1337982 non-null  int64 
 1   post_text        1329200 non-null  object
 2   date             1337982 non-null  object
 3   lemmatized_body  1337982 non-null  object
dtypes: int64(1), object(3)
memory usage: 40.8+ MB


In [None]:
# substance dictionary
substance_dict={}
substance_dict['sn'] = ['sn', 'sni', 'sodium nitrite', 'sodium nitrate', 'nano3','nano2',
                        'snit','nitrite', 'nitrate','nitrit','nitrites','snwhat']
substance_dict['acid regulator'] = ['acid reducer','acid regulator', 'anti acid','antacid',
                                    'tagamet','tagmet','tagament','cimetidine']
substance_dict['hanging'] = ['hanging','hang','full suspension','hanged']
substance_dict['ricin'] = ['ricin']
substance_dict['plant-based poisons'] = ['yew','datura','apocynaceae','pong pong','othalanga','cerberin']
substance_dict['antiemetic'] = ['anti emetic','antiemetic']
substance_dict['nitric oxide'] = ['nitric oxide']
substance_dict['cyanides'] = ['potassium cyanide','sodium cyanide', 'cyanide', 'apple seeds', 'amygdalin',
                              'kcn','hcn','hydrogen cyanide']
substance_dict['household chemicals'] = ['pesticide','drain cleaner','rat poison','butanediol']
substance_dict['barbiturates'] = ['pentobarbital sodium','nembutal','sodium thiopental']
substance_dict['other preservatives'] = ['azide','sodium azide','curing salts', 'curing salt','curing meat','meat curing']
substance_dict['firearm'] = ['gun', 'firearm', 'firearms', 'handgun', 'shotgun', 'pistol', 'rifle', 'revolver', 'pistols',
                             'shotguns', 'guns', 'handguns', 'glock','magnum','shoot']
substance_dict['benzodiazepines']= ['alprazolam','clonazepam','diazepam','triazolam',
                               'lorazepam','midazolam','temazepam','oxazepam', 'nimetazepam', 'norflurazepam','metizolam','nitrazolam','adinazolam',
                           'phenazepam', 'pyrazolam', 'flualprazolam', 'flunitrazepam', 'flunitrazolam',
                           'bromazolam', 'nitrazepam','clonazolam', 'flubromazolam','bromazepam',
                           'etizolam','diclazepam','flubromazepam', 'benzos', 'benzodiazepine','benzo',
                                    'benzodiazepine']
substance_dict['opioids'] = ['hyrdrocodone', 'hydrocodiene', 'hydro_codone', 'hydrocodene', 'hydrocordone',
                             'hydorcodone', 'hydrocodeine', 'hidrocodone', 'hyrocodone', 'hydrocone', 'hydrodone',
                             'hyrdocodone', 'hydrcodone', 'hydrocodones', 'hydrocordisone', 'hydocodone', 'hydrocode',
                             'hydrocodons', 'hydracodone', 'hydrocodone', 'hydrocodin', 'hydrocodine', 'hydrocodon',
                             'hydrocondone', 'hydrocodne', 'hydrocdone', 'hydros', '357s', 'lortab', 'lortabs',
                             'lorcet', 'lorcets', 'lortab', 'norco', 'vicoden', 'vicadin', 'viodin', 'vicodin',
                             'vicodines', 'vicodan', 'vicodien', 'viccodin', 'vocodin', 'vicondin', 'vicoding',
                             'vicodins', 'vicodon', 'vicidin', 'vidodin', 'vikodin', 'viacodin', 'vicodine', 'vicdin',
                             'vicotin', 'percoset', 'percocets', 'percocett', 'pecocets', 'percocoet', 'percocit',
                             'percet', 'percoct', 'percocet10', 'percicet', 'percocetes', 'percecet', 'percocet',
                             'oxocodone', 'oxycodene', 'oxycondone', 'oycodone', 'oxyxodone', 'oxycodones', 'oxicodone',
                             'oxy_codone', 'oxycodone', 'oxycodine', 'roxycodone', 'ocycodone', 'oxycodons', 'oxcodone',
                             'oycondone', 'oxycodon', 'oyxcodone', 'oxcycodone', 'oxycodne', 'oxy', 'oxys', 'roxy',
                             'roxies', 'roxicodone', 'oc', 'percs', 'm30', 'm30s', 'dilaudid', 'hydromorphone',
                             'oxymorphone', 'ocycontin', 'oxcontin', 'oxcotin', 'oxcycontin', 'oxycotine', 'oxycontin',
                             'roxycontin', 'oycotin', 'oxyconton', 'oxycontine', 'oxycotins', 'oxycontin', 'oxycintin',
                             'oxy_contin', 'oxicontin', 'oxycontins', 'oxycottin', 'oycontin', 'morphin', 'morfin',
                             'morphs', 'tramadol', 'trmadol', 'tramdol', 'tramadol', 'fentinyl', 'fentenyl',
                             'fenanyl', 'fentanly', 'fentnyal', 'fentanol', 'fental', 'fetanyl', 'fentayl',
                             'fentanayl', 'fentanyl', 'fentyl', 'fentanal', 'fetnyl', 'fentynyl', 'fentnayl',
                             'fentanl', 'fentyanl', 'fentonyl', 'fentanyal', 'fentany', 'fentnyl', 'fent', 'fents',
                             'carfentanil', 'carfentanyl', 'carfent', 'heroin', 'herroin', 'herioin', 'heroins',
                             'sufentanyl', 'sufentanil']

all_keywords = []
for lis in substance_dict.values():
  all_keywords = all_keywords+lis

print(len(all_keywords))
all_keywords

256
256


['sn',
 'sni',
 'sodium nitrite',
 'sodium nitrate',
 'nano3',
 'nano2',
 'snit',
 'nitrite',
 'nitrate',
 'nitrit',
 'nitrites',
 'snwhat',
 'acid reducer',
 'acid regulator',
 'anti acid',
 'antacid',
 'tagamet',
 'tagmet',
 'tagament',
 'cimetidine',
 'hanging',
 'hang',
 'full suspension',
 'hanged',
 'ricin',
 'yew',
 'datura',
 'apocynaceae',
 'pong pong',
 'othalanga',
 'cerberin',
 'anti emetic',
 'antiemetic',
 'nitric oxide',
 'potassium cyanide',
 'sodium cyanide',
 'cyanide',
 'apple seeds',
 'amygdalin',
 'kcn',
 'hcn',
 'hydrogen cyanide',
 'pesticide',
 'drain cleaner',
 'rat poison',
 'butanediol',
 'pentobarbital sodium',
 'nembutal',
 'sodium thiopental',
 'azide',
 'sodium azide',
 'curing salts',
 'curing salt',
 'curing meat',
 'meat curing',
 'gun',
 'firearm',
 'firearms',
 'handgun',
 'shotgun',
 'pistol',
 'rifle',
 'revolver',
 'pistols',
 'shotguns',
 'guns',
 'handguns',
 'glock',
 'magnum',
 'shoot',
 'alprazolam',
 'clonazepam',
 'diazepam',
 'triazolam',


In [None]:
df['keywords_matched'] = ''
df['group_name'] = ''

In [None]:
df['lemmatized_body'].head(2)

0            one nice thing
1    care well being others
Name: lemmatized_body, dtype: object

In [None]:
def string_found(string1, string2):
  string1 = " " + string1.strip() + " "
  string2 = " " + string2.strip() + " "
  return string2.find(string1)

for i in range(len(df)):
  curr_sentence = df['lemmatized_body'][i]  # this should be str type
  matches = [ele for ele in all_keywords if(string_found(ele,curr_sentence)>0)]

  if(matches):
    df['keywords_matched'][i] = matches
    group_name = []
    for j in range(len(matches)):
      dict_group_value = [key for key, list_of_values in substance_dict.items() if matches[j] in list_of_values][0]

      if(dict_group_value!=None):
        group_name.append(dict_group_value)

    df['group_name'][i] = list(set(group_name))

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['keywords_matched'][i] = matches
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df['group_name'][i] = list(set(group_name))


In [None]:
# find month + year associated with each post
df['date_n'] = pd.to_datetime(df['date'])    # converting string type to datetime object
list_year = [i.split(" ")[0][0:4] for i in list(df['date_n'].astype(str))]
list_month = [i.split(" ")[0][5:7] for i in list(df['date_n'].astype(str))]
df['year'] = list_year
df['month'] = list_month

In [None]:
df.drop(['date_n'], axis=1)
del df['date_n']
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1337982 entries, 0 to 1337981
Data columns (total 8 columns):
 #   Column            Non-Null Count    Dtype 
---  ------            --------------    ----- 
 0   post_id           1337982 non-null  int64 
 1   post_text         1329200 non-null  object
 2   date              1337982 non-null  object
 3   lemmatized_body   1337982 non-null  object
 4   keywords_matched  1337982 non-null  object
 5   group_name        1337982 non-null  object
 6   year              1337982 non-null  object
 7   month             1337982 non-null  object
dtypes: int64(1), object(7)
memory usage: 81.7+ MB


In [None]:
df2 = df.drop(['post_id','post_text','date','lemmatized_body','keywords_matched','group_name'], axis=1)
df2.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1337982 entries, 0 to 1337981
Data columns (total 2 columns):
 #   Column  Non-Null Count    Dtype 
---  ------  --------------    ----- 
 0   year    1337982 non-null  object
 1   month   1337982 non-null  object
dtypes: object(2)
memory usage: 20.4+ MB


In [None]:
df = df.loc[~(df['group_name']=='')] # posts which did not contain keywords are discarded from further analysis
df.reset_index(drop = True,inplace = True)

In [None]:
len(df)

128759

In [None]:
# To track frequency of each group
frequency_dict = {'sn':0,'hanging':0,'firearm':0, 'benzodiazepines':0,'opioids':0,'antiemetic':0,'barbiturates':0,'acid regulator':0,'cyanides':0,'other preservatives':0,'plant-based poisons':0,'household chemicals':0, 'ricin':0,'nitric oxide':0}

def count_unique(lis):
  global frequency_dict

  for i in range(len(lis)):
    frequency_dict[lis[i]] = frequency_dict[lis[i]]+1
  return

df.apply(lambda x: count_unique(x['group_name']),axis=1)

0         None
1         None
2         None
3         None
4         None
          ... 
128754    None
128755    None
128756    None
128757    None
128758    None
Length: 128759, dtype: object

In [None]:
frequency_dict

{'sn': 62178,
 'hanging': 29623,
 'firearm': 16636,
 'benzodiazepines': 12490,
 'opioids': 8017,
 'antiemetic': 9092,
 'barbiturates': 4157,
 'acid regulator': 4321,
 'cyanides': 1874,
 'other preservatives': 1245,
 'plant-based poisons': 546,
 'household chemicals': 552,
 'ricin': 117,
 'nitric oxide': 113}

In [None]:
df.head(5)

Unnamed: 0,post_id,post_text,date,lemmatized_body,keywords_matched,group_name,date_n,year,month
0,1641475,this one can relate. symmetry is a huge hang u...,"Jun 16, 2022",one relate symmetry huge hang one taking pictu...,[hang],[hanging],2022-06-16,2022,6
1,1709493,pm me with a bit of a more detailed question\r...,"Sep 4, 2022",pm bit detailed question hamvil said get back ...,[hang],[hanging],2022-09-04,2022,9
2,233329,i feel like ”hanging” out.,"Feb 16, 2019",feel like ” hanging ”,[hanging],[hanging],2019-02-16,2019,2
3,241976,nervous \r\n\r\nbeen planning and reading a lo...,"Feb 22, 2019",nervous planning reading lot sn dmso get wedne...,[sn],[sn],2019-02-22,2019,2
4,409855,"very sleep, had an increase it carbmazapine an...","Sep 11, 2019",sleep increase carbmazapine given diazepam fee...,[diazepam],[benzodiazepines],2019-09-11,2019,9


In [None]:
df.to_csv(my_path+'KEYWORDS_FILTERED_PROCESSED_TEXT_CORPUS_v2.csv',index=False)

In [None]:
df.drop(['post_id','post_text','date','lemmatized_body','keywords_matched','group_name'], axis=1, inplace=True)
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 128759 entries, 0 to 128758
Data columns (total 2 columns):
 #   Column  Non-Null Count   Dtype 
---  ------  --------------   ----- 
 0   year    128759 non-null  object
 1   month   128759 non-null  object
dtypes: object(2)
memory usage: 2.0+ MB


In [None]:
# save data for monthly frequency visualization

df.to_csv(my_path+'keywords_filtered_posting_frequency_month_year.csv',index=False)