# Keyword Extraction and Analysis (<1 min)
This notebook reads in the provided sample data, tokenizes the narratives, identifies candidate keywords using Word2Vec similarity and log odds, and analyzes the final set of chosen keywords. The analysis calculates the total number of narratives containing each keyword and organizes the keywords by topic for Table 2 in the final submission.

Create a virtual environment using `requirements.txt`, which should provide all necessary packages for this notebook (and notebook 2, but not notebook 3).

#### Imports and Reading Data
Creates `cleaned-nvdrs-youth-restricted.csv`, which is the raw data with one added column containing the combined LE and CME narratives. The commented-out NLTK downloads in the cell below may need to be run once before the first run.

In [None]:
import pandas as pd
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, sent_tokenize
from nltk.stem import WordNetLemmatizer
import string
import re
import gensim
import numpy as np
from collections import Counter

#! might have to run these lines once (uncommenting them also requires `import nltk`)
# nltk.download('stopwords')
# nltk.download('punkt_tab')
# nltk.download('wordnet')

# Load the raw narratives; paths are relative to the notebook's directory.
narratives = pd.read_csv("../data/raw/nvdrs-youth-restricted.csv")

# Combine the LE and CME narratives into one text column.
# - fillna(""): NaN + str is NaN, so one missing narrative would otherwise blank
#   out the other one and later break the string-based tokenization.
# - " " separator: keeps the last word of the LE narrative from fusing with the
#   first word of the CME narrative.
narratives["combined_narratives"] = (
    narratives["NarrativeLE"].fillna("")
    + " "
    + narratives["NarrativeCME"].fillna("")
).str.strip()
narratives.to_csv("../data/interim/cleaned-nvdrs-youth-restricted.csv", index=False)

# Keep only the columns used below; .copy() gives an independent frame so that
# columns added in later cells are never written onto a view of the raw data.
narratives = narratives[[
    "uid", "combined_narratives", "DisclosedToSocialMedia"]].copy()
narratives

#### Tokenization
Creates `narr-tokens.csv`, a CSV file containing the tokenized text of the narrative. 

In [None]:
# Per-narrative token lists are collected here by the loop further down
narrative_tokens = []
# English stop words from NLTK, held in a set for fast membership tests
stop_words = set(stopwords.words('english'))
# NOTE(review): lemmatizer is created but not used in the visible cells
lemmatizer = WordNetLemmatizer()

# Token filter; removes stop words, punctuation, small words, all 'X' words, and words with digits
def token_filter(word):
  """Return True when `word` should be kept as a token.

  `word` is expected to be lower-cased already (the stop-word list and the
  'x' check are lower-case). A token is rejected when it
    - has two or fewer characters,
    - is in `stop_words`,
    - consists only of punctuation characters,
    - consists only of the letter 'x', or
    - contains any digit.
  """
  if len(word) <= 2 or word in stop_words:
    return False
  # Check every character: a plain `word in string.punctuation` is a substring
  # test and lets multi-character punctuation tokens such as '...' through.
  if all(char in string.punctuation for char in word):
    return False
  if all(char == 'x' for char in word):
    return False
  return not any(char.isdigit() for char in word)

# Handle punctuation between words without spaces (e.g. 'vehicle.the').
# Look-arounds do not consume the neighbouring letters, so back-to-back cases
# such as 'a.b.c' or 'www.site.com' are split at every '.'/','.
JOINED_WORDS_PATTERN = re.compile(r'(?<=[a-zA-Z])[.,](?=[a-zA-Z])')

# Start from an empty list so re-running this cell never appends duplicates
narrative_tokens = []
for narrative in narratives['combined_narratives']:
  # A missing narrative (NaN is a float) is tokenized as empty text rather
  # than raising a TypeError inside the regex substitution
  if not isinstance(narrative, str):
    narrative = ''
  narrative = JOINED_WORDS_PATTERN.sub(' ', narrative)

  # Lower-case each token once, then keep the ones that pass the filter
  lowered_words = (word.lower() for word in word_tokenize(narrative))
  narrative_tokens.append([word for word in lowered_words if token_filter(word)])

# One row per narrative, one column per token position (short rows are padded with missing values)
narrative_tokens = pd.DataFrame(narrative_tokens)
# Attach the label by position: rows are in the same order as `narratives`,
# so this stays correct even if `narratives` does not have a 0..n-1 index
narrative_tokens['DisclosedToSocialMedia'] = narratives['DisclosedToSocialMedia'].to_numpy()
narrative_tokens.to_csv('../data/interim/narr-tokens.csv', index=False)
narrative_tokens

#### Word to Vector
Creates `potential-kws.csv`, a list of words that a Word2Vec model trained on the narrative tokens ranks as most similar to the source words; these words are candidates to consider as keywords.

In [None]:
# Fixed seed so the suggested keywords can be reproduced between runs
W2V_SEED = 42

token_frame = pd.read_csv('../data/interim/narr-tokens.csv', low_memory=False)
token_frame = token_frame.loc[:, ~token_frame.columns.str.contains('^Unnamed')]
# The label column is stored in the same file; drop it so the 0/1 flag is not
# fed to Word2Vec as if it were a word in every narrative
token_frame = token_frame.drop(columns='DisclosedToSocialMedia', errors='ignore')
# One list of tokens per narrative (padding NaNs removed)
narrative_tokens = [row.dropna().tolist() for _, row in token_frame.iterrows()]

# seed + workers=1 make training repeatable (multi-threaded training is not
# deterministic). NOTE(review): gensim's docs also mention PYTHONHASHSEED for
# fully identical runs across interpreter launches — confirm if needed.
model1 = gensim.models.Word2Vec(narrative_tokens, min_count=10,
                                vector_size=100, window=5,
                                seed=W2V_SEED, workers=1)

source_words = ['image', 'posting', 'copy', 'monitor', 'posted', 'reply', 'instagram', 'facebook']

similar_words = set()

for word in source_words:
  # Words seen fewer than min_count times are not in the vocabulary;
  # most_similar would raise KeyError for them
  if word not in model1.wv:
    print(f"Skipping source word not in the Word2Vec vocabulary: {word}")
    continue
  for similar_word, _similarity in model1.wv.most_similar(word, topn=25):
    similar_words.add(similar_word)

# Sort so the CSV row order does not depend on set iteration order
similar_words = pd.DataFrame(sorted(similar_words))
similar_words.to_csv('../data/interim/potential-kws.csv', index=False)
similar_words


#### Keyword Extraction
Creates `tokens.csv`, a list of the words from all narratives together with their counts and their log odds of appearing in disclosure versus non-disclosure narratives; this list is used to identify keyword candidates.

In [None]:
# Words whose combined count is not above this value are dropped before computing odds
RARITY_THRESHOLD = 12

narrative_tokens = pd.read_csv("../data/interim/narr-tokens.csv", low_memory=False)
# Pull the label column out of the token table and turn it into a boolean mask:
# True where DisclosedToSocialMedia equals 1, False otherwise (including missing values)
D_mask = narrative_tokens.pop('DisclosedToSocialMedia') == 1

# count number of occurrences of each token in a series of narratives filtered by mask
def count_tokens(narrs, mask):
  """Count how often each token occurs in the narratives selected by `mask`.

  Args:
    narrs: DataFrame with one row per narrative and one token per cell;
      missing values and empty strings are padding and are ignored.
    mask: boolean Series/array selecting the rows of `narrs` to count.

  Returns:
    DataFrame indexed by token with a single integer column 'count'.
    The frame is empty (but still has the 'count' column) when no row is
    selected or the selected rows hold no tokens.
  """
  # Accumulate into a single Counter instead of materializing a dense
  # narratives x vocabulary table only to sum it column-wise
  counts = Counter()
  for row in narrs[mask].itertuples(index=False, name=None):
    counts.update(token for token in row if pd.notna(token) and token != '')
  tokens = pd.Series(counts, dtype='int64').to_frame('count')
  return tokens

D_tokens = count_tokens(narrative_tokens, D_mask)
ND_tokens = count_tokens(narrative_tokens, ~D_mask)

# Merge tokens: the outer join keeps words seen in only one group (missing count -> 0)
tokens = pd.merge(D_tokens, ND_tokens, how='outer', left_index=True, right_index=True, suffixes=('_D', '_ND'))
tokens = tokens.fillna(0)
# Add-one smoothing so a word absent from one group never gets zero odds
tokens['count_D'] = tokens['count_D'] + 1
tokens['count_ND'] = tokens['count_ND'] + 1

# Compute total count across all disclosure/non-disclosure narratives
# (taken before the rarity filter, so rare words still count towards the denominators)
total_D_tokens = tokens['count_D'].sum()
print('Total Disclosure Tokens:', total_D_tokens)
total_ND_tokens = tokens['count_ND'].sum()
print('Total Non-Disclosure Tokens:', total_ND_tokens)

# Features
tokens['count'] = tokens['count_D'] + tokens['count_ND']
# NOTE(review): 'count' includes the +1 added to each group above, so this keeps
# words whose raw combined count exceeds RARITY_THRESHOLD - 2 — confirm intended.
# .copy() so the feature columns below are added to an independent frame.
tokens = tokens[tokens['count'] > RARITY_THRESHOLD].copy()
tokens['prob_D'] = tokens['count_D'] / total_D_tokens
tokens['prob_ND'] = tokens['count_ND'] / total_ND_tokens
tokens['odds_D'] = tokens['prob_D'] / (1 - tokens['prob_D'])
tokens['odds_ND'] = tokens['prob_ND'] / (1 - tokens['prob_ND'])
tokens['odds_ratio'] = tokens['odds_D'] / tokens['odds_ND']
# Vectorized log; ratios that are not > 0 (including NaN) are mapped to a
# log-odds of 0 by substituting 1 before taking the log
tokens['log_odds'] = np.log(tokens['odds_ratio'].where(tokens['odds_ratio'] > 0, 1.0))

# Cleanup: most frequent words first, ties broken by highest log odds.
# One multi-key sort replaces sorting by log odds and then re-sorting by count,
# which discarded the first ordering and left the order of ties undefined.
tokens = tokens.sort_values(['count', 'log_odds'], ascending=[False, False])
tokens.index.name = 'word'
tokens.to_csv("../data/interim/tokens.csv")

#### Keyword Analysis
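Creates `topic-crosstab.csv`, a table with the counts and prevalence values of each group of keywords in the text. The cells below also write `used-keyword-analysis.csv` (per-keyword counts) and `subset.csv` (the shuffled narratives that contain at least one keyword).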

In [None]:
def takeout_text(text):
  """Search `text` for 'message' after removing every 'text messag' occurrence.

  Returns the `re.Match` for the first remaining 'message', or None when there
  is none (for example when the only mentions were 'text message(s)').
  NOTE(review): not called anywhere in the visible cells.
  """
  without_text_messages = re.sub('text messag', '', text)
  return re.search('message', without_text_messages)


# Collapse each row of the token table into a plain list of tokens (padding NaNs
# removed) and attach it to the narratives as the 'tokens' column.
# Relies on `narrative_tokens` being the token DataFrame from the Keyword Extraction cell.
token_lists = narrative_tokens.apply(
  lambda token_row: token_row.dropna().tolist(), axis=1)
narratives = narratives.assign(tokens=token_lists)
narratives

In [None]:
# Placeholder columns filled in by the keyword-matching cell:
# an empty string per narrative, and a separate empty list per narrative
narratives = narratives.assign(
  found_keywords='',
  topics=[list() for _ in narratives.index],
)
narratives

In [None]:
def create_mask(row, keyword):
  """Return True when `keyword` occurs in `row['tokens']`.

  A keyword without spaces must equal one token exactly. A keyword containing
  spaces is split on whitespace and must appear as a run of consecutive tokens
  in the same order.
  """
  tokens = row['tokens']

  # Single-word keyword: exact match against any token
  if ' ' not in keyword:
    for token in tokens:
      if token == keyword:
        return True
    return False

  # Multi-word keyword: slide a window of the phrase length over the tokens
  phrase = keyword.split()
  width = len(phrase)
  for start in range(len(tokens) - width + 1):
    if tokens[start:start + width] == phrase:
      return True
  return False


# Final keyword list, grouped by topic. Multi-word entries (e.g. 'search history')
# are matched as consecutive tokens by create_mask.
topics = {
  'Web': [
    'online', 'cyber', 'account', 'web', 'website', 'webpage', 'webpages',
  ],
  'Social Media': [
    'facebook', 'instagram', 'youtube', 'twitter', 'discord', 'network',
    'forum', 'app', 'apps', 'delete', 'deleted', 'caption',
  ],
  'Message': [
    'post', 'posts', 'posted', 'posting', 'chat', 'chatting', 'chatted',
    'chatroom', 'chatrooms', 'message', 'messaged', 'messages', 'messaging',
  ],
  'Other': [
    'game', 'gaming', 'games', 'dating', 'porn', 'stalk', 'image', 'stream',
    'computer', 'laptop', 'internet', 'email', 'emailed', 'search history',
    'search engine', 'device', 'electronic', 'digital', 'browser',
  ],
}

# Flatten to one (keyword, topic) row per keyword, in the order listed above
keywords = pd.DataFrame(
  [(kw, topic_name) for topic_name, topic_kws in topics.items() for kw in topic_kws],
  columns=['keyword', 'topic'],
)
# Number of keywords per topic
keywords.groupby('topic').count()

In [None]:
# Keyword -> topic lookup built once instead of filtering the keyword table for
# every hit (if a keyword were listed twice, its first topic is used)
keyword_to_topic = (
  keywords.drop_duplicates('keyword').set_index('keyword')['topic'].to_dict())

# For every narrative, the keywords found in its tokens (in keyword-table order)
found_per_narrative = [
  [keyword for keyword in keywords['keyword'] if create_mask({'tokens': tokens}, keyword)]
  for tokens in narratives['tokens']
]

narratives = narratives.assign(
  found_keywords=[', '.join(found) for found in found_per_narrative],
  # dict.fromkeys removes duplicate topics while keeping first-seen order, so the
  # topic lists are the same on every run (a set has no stable order for strings)
  topics=[
    list(dict.fromkeys(keyword_to_topic[keyword] for keyword in found))
    for found in found_per_narrative
  ],
)

narratives

In [None]:
# Collect one record per keyword and build the table once; growing a DataFrame
# with pd.concat inside the loop is quadratic and concatenating onto an empty
# frame is deprecated in recent pandas versions.
keyword_rows = []

# Pair each keyword with its topic directly rather than looking the topic up by
# loop position, which is only correct while `keywords` has a 0..n-1 index
for keyword, topic in zip(keywords['keyword'], keywords['topic']):
  contains_KW = narratives['tokens'].apply(
    lambda x: create_mask({'tokens': x}, keyword)).astype(bool)

  disclosure_flags = narratives.loc[contains_KW, 'DisclosedToSocialMedia']

  keyword_rows.append({
      'Keyword': keyword,
      'Total': int(contains_KW.sum()),
      # `== 1` / `== 0` count the flag the same way whether it is stored as
      # bool or as 0/1; missing values fall in neither column
      'Social Media Disclosure': int((disclosure_flags == 1).sum()),
      'No Social Media Disclosure': int((disclosure_flags == 0).sum()),
      'Topic': topic,
  })

crosstab = pd.DataFrame(keyword_rows, columns=[
    'Keyword', 'Total', 'Social Media Disclosure', 'No Social Media Disclosure', 'Topic'])

# Group by topic, most frequent keyword first within each topic
crosstab = crosstab.sort_values(by=['Topic', 'Total'], ascending=[True, False])
crosstab.to_csv('../data/interim/used-keyword-analysis.csv', index=False)
crosstab

In [None]:
# One record per topic, assembled into a DataFrame once after the loop
topic_rows = []

# explode() yields NaN for narratives whose topic list is empty; drop those so
# NaN is not treated as a topic (it would add a spurious all-zero row).
# Sorting makes the row order independent of the order topics were first seen.
present_topics = narratives['topics'].explode().dropna().unique()

for topic in sorted(present_topics):
    in_topic = narratives['topics'].apply(lambda x: topic in x).astype(bool)
    filtered_youth_narratives = narratives[in_topic]
    disclosure_flags = filtered_youth_narratives['DisclosedToSocialMedia']

    topic_rows.append({
        'Topic': topic,
        'Total': filtered_youth_narratives.shape[0],
        # `== 1` / `== 0` work for both bool and 0/1 encodings of the flag
        'Social Media Disclosure': int((disclosure_flags == 1).sum()),
        'No Social Media Disclosure': int((disclosure_flags == 0).sum()),
    })

topic_crosstab_values = pd.DataFrame(
    topic_rows,
    columns=['Topic', 'Total', 'Social Media Disclosure', 'No Social Media Disclosure'])

topic_crosstab_values.to_csv('../data/processed/topic-crosstab.csv', index=False)
topic_crosstab_values

In [None]:
# Fixed seed so the shuffled export is identical on every run
SHUFFLE_SEED = 42

# Keep only narratives in which at least one keyword was found
narratives = narratives[narratives['found_keywords'] != '']

narratives_count = narratives.shape[0]
# Count flags equal to 1 (also True for booleans); missing flags are not counted as disclosures
disclosure_count = int((narratives['DisclosedToSocialMedia'] == 1).sum())
non_disclosure_count = narratives_count - disclosure_count

print(f"Total: {narratives_count}")
print(f"Disclosure Count: {disclosure_count}")
print(f"Non-Disclosure Count: {non_disclosure_count}")

subset = narratives[['uid', 'combined_narratives', 'found_keywords', "DisclosedToSocialMedia"]]
# Shuffle the rows reproducibly, then renumber them from 0
subset = subset.sample(frac=1, random_state=SHUFFLE_SEED).reset_index(drop=True)
subset.to_csv("../data/interim/subset.csv", index=False)
subset