In [1]:
# TODO: create a deployment package using AWS Lambda Layers

# to manipulate dataframes
import pandas as pd

# natural language processing: n-gram ranking
import re
import unicodedata
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

# for natural language processing: named entity recognition
import spacy
from collections import Counter

# for natural language processing: sentiment analysis
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.sentiment.util import *
from nltk import word_tokenize, sent_tokenize

# extra tokens to ignore during cleaning, on top of NLTK's English stop words
# ('nan' is presumably the string form of missing values -- TODO confirm)
ADDITIONAL_STOPWORDS = [
    'nan',
]

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ednalyndedios/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ednalyndedios/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# TODO: setup trigger on S3 bucket

# load the input file; the text to analyse MUST be in the first column
df = pd.read_csv(filepath_or_buffer='../data/input/data.csv')

In [3]:
# keep only the rows whose first (text) column is populated
df = df[df.iloc[:, 0].notna()]

# N-gram Ranking

In [4]:
def clean(text):
    """
    Normalise raw text into a list of lower-cased, lemmatized tokens.

    The text is NFKD-normalised, stripped of every character that cannot
    be encoded as ASCII, lower-cased and stripped of punctuation before
    being split on whitespace. Tokens found in NLTK's English stop-word
    list or in ADDITIONAL_STOPWORDS are dropped (the check is done on the
    raw token, before lemmatization); the remaining tokens are lemmatized
    with WordNet.

    Parameters
    ----------
    text : str
        The raw text to clean.

    Returns
    -------
    list of str
        Lemmatized tokens, in their original order.
    """
    wnl = nltk.stem.WordNetLemmatizer()
    # a set gives constant-time membership tests; a list would be rescanned
    # once for every token of the corpus
    stopwords = set(nltk.corpus.stopwords.words('english')) | set(ADDITIONAL_STOPWORDS)
    text = (unicodedata.normalize('NFKD', text)
            .encode('ascii', 'ignore')
            .decode('utf-8', 'ignore')
            .lower())
    # punctuation is deleted, not replaced by a space
    words = re.sub(r'[^\w\s]', '', text).split()
    return [wnl.lemmatize(word) for word in words if word not in stopwords]

def get_bigrams(content, top_n=10):
    """
    Rank the most frequent bigrams in a list of words.

    Parameters
    ----------
    content : list of str
        Tokens, in reading order.
    top_n : int, default 10
        How many of the most frequent bigrams to keep.

    Returns
    -------
    pandas.DataFrame
        Two columns, 'bigram' and 'count', ordered by descending count
        and holding at most ``top_n`` rows.
    """
    # dtype=object keeps each bigram tuple as a single cell value
    counts = pd.Series(list(nltk.ngrams(content, 2)), dtype=object).value_counts()
    bigrams = counts.iloc[:top_n].to_frame().reset_index()
    bigrams.columns = ['bigram', 'count']
    return bigrams

def get_trigrams(content, top_n=10):
    """
    Rank the most frequent trigrams in a list of words.

    Parameters
    ----------
    content : list of str
        Tokens, in reading order.
    top_n : int, default 10
        How many of the most frequent trigrams to keep.

    Returns
    -------
    pandas.DataFrame
        Two columns, 'trigram' and 'count', ordered by descending count
        and holding at most ``top_n`` rows.
    """
    # dtype=object keeps each trigram tuple as a single cell value
    counts = pd.Series(list(nltk.ngrams(content, 3)), dtype=object).value_counts()
    trigrams = counts.iloc[:top_n].to_frame().reset_index()
    trigrams.columns = ['trigram', 'count']
    return trigrams

In [5]:
# converts the text column to one list of clean tokens; the rows are joined
# with a space so that the raw text itself is cleaned (calling str() on the
# whole list cleans its repr instead, which turns escapes such as "\n" into
# a stray "n" glued onto the neighbouring words)
content = clean(' '.join(df.iloc[:, 0].astype(str)))

In [6]:
# TODO: push to S3
# write the bigram ranking to a flat CSV file, without the row index
get_bigrams(content).to_csv(
    path_or_buf='../data/output/output_data_ngram_bigrams.csv',
    index=False,
)

In [7]:
# TODO: push to S3
# write the trigram ranking to a flat CSV file, without the row index
get_trigrams(content).to_csv(
    path_or_buf='../data/output/output_data_ngram_trigrams.csv',
    index=False,
)

# Sampling 5K

In [8]:
# number of rows left after the null filter
num_rows = len(df)

In [9]:
# cap the data set at MAX_ROWS records before the downstream steps;
# smaller data sets are left untouched
MAX_ROWS = 5000
if num_rows > MAX_ROWS:
    # the fixed random_state keeps the sample reproducible between runs
    df = df.sample(MAX_ROWS, random_state=493).reset_index(drop=True)

# Named Entity Recognition

In [10]:
def get_entities(entities, ent_type, top_n=20):
    """
    Rank the most frequent entity texts of one entity type.

    Parameters
    ----------
    entities : iterable
        Entity objects exposing a ``label_`` and a ``text`` attribute.
    ent_type : str
        Label to keep; it is compared for equality with ``label_``.
    top_n : int, default 20
        How many of the most frequent entity texts to keep.

    Returns
    -------
    pandas.DataFrame
        Two columns, 'entity' and 'count', ordered by descending count.
        The frame is empty (but still has both columns) when no entity
        carries the requested label.
    """
    matches = Counter(entity.text for entity in entities if entity.label_ == ent_type)
    # passing the column names to the constructor also works for an empty
    # ranking; assigning them afterwards raises on a frame with no columns
    return pd.DataFrame(matches.most_common(top_n), columns=['entity', 'count'])

In [11]:
# load spaCy's small English pipeline without the components named in `disable`
nlp = spacy.load(
    "en_core_web_sm",
    disable=["tagger", "parser", "textcat"],
)

In [12]:
%%timeit
df['tokens'] = [nlp(''.join(str(row))) for row in df.iloc[:,0]]

1.54 s ± 231 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)


In [13]:
# entities recognised in each processed document, one cell per row
df['entities'] = [doc.ents for doc in df['tokens']]

In [14]:
# per-row entity collections as a plain list
# NOTE(review): `entities` is reassigned to the flattened list in the next cell
entities = df['entities'].tolist()

In [15]:
# flatten the per-row entity collections into a single list
entities = [ent for row_ents in df['entities'].tolist() for ent in row_ents]

In [16]:
# entity labels to export, one CSV file per label
target_entities = ['PERSON', 'NORP', 'FAC', 'ORG', 'GPE', 'LOC', 'PRODUCT', 'EVENT']

for item in target_entities:
    file_name = '../data/output/output_data_ner_{}.csv'.format(item.lower())

    try:
        get_entities(entities, item).to_csv(file_name, index=False)
    except ValueError as err:
        # raised when the ranking cannot be built for this label (e.g. no such
        # entity was found): skip it, but let real failures such as an
        # unwritable output path surface instead of being silently swallowed
        print('skipping {}: {}'.format(item, err))

In [17]:
# discard the helper columns added for entity recognition
df = df.drop(['tokens', 'entities'], axis='columns')

# Sentiment Analysis

In [18]:
# fetch the VADER lexicon first, then build the analyzer used by the cells below
nltk.download('vader_lexicon')
sentiment = SentimentIntensityAnalyzer()

[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/ednalyndedios/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!


In [19]:
# score every text once and fan the four VADER components out into columns
# (running polarity_scores separately for each column quadruples the work)
vader_scores = pd.DataFrame(
    [sentiment.polarity_scores(str(text)) for text in df.iloc[:, 0]],
    index=df.index,  # keeps the scores aligned with df's rows
    columns=['compound', 'neu', 'neg', 'pos'],
)
df['polarity_score'] = vader_scores['compound']
df['neutral'] = vader_scores['neu']
df['negative'] = vader_scores['neg']
df['positive'] = vader_scores['pos']

In [20]:
# bucket the compound score into five sentiment labels
polarity = df['polarity_score']
label_rules = [
    (polarity > 0.78, 'Strongly Positive'),
    ((polarity >= 0.30) & (polarity <= 0.78), 'Moderate Positive'),
    ((polarity < 0.30) & (polarity >= -0.30), 'Neutral'),
    # NOTE(review): the lower-case "negative" differs from the other labels; kept as-is
    ((polarity < -0.30) & (polarity >= -0.55), 'Moderate negative'),
    (polarity < -0.55, 'Strongly Negative'),
]
for mask, label in label_rules:
    df.loc[mask, 'sentiment'] = label

In [21]:
# write the scored rows to a flat CSV file, without the row index
df.to_csv(
    path_or_buf='../data/output/output_data_sentiment.csv',
    index=False,
)

In [22]:
# discard the sentiment columns now that they have been exported
df = df.drop(
    ['polarity_score', 'neutral', 'negative', 'positive', 'sentiment'],
    axis='columns',
)