In [28]:
import nltk

# Fetch the NLTK data packages in one call (a list of package ids is passed to
# nltk.download).
nltk.download([
    "names",                       # not referenced by the cells visible here
    "stopwords",                   # read by remove_stopwords()
    "averaged_perceptron_tagger",  # not referenced by the cells visible here
    "vader_lexicon",               # lexicon for SentimentIntensityAnalyzer (VADER)
    "punkt",                       # not referenced by the cells visible here
    ])

[nltk_data] Downloading package names to /Users/eltontay/nltk_data...
[nltk_data]   Package names is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/eltontay/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/eltontay/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package vader_lexicon to
[nltk_data]     /Users/eltontay/nltk_data...
[nltk_data]   Package vader_lexicon is already up-to-date!
[nltk_data] Downloading package punkt to /Users/eltontay/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


True

In [29]:
import pandas as pd
import numpy as np
from datetime import datetime
import json

import nltk
nltk.download('omw-1.4')
nltk.download('stopwords')
nltk.download('wordnet')
from nltk.corpus import stopwords
from nltk.stem.wordnet import WordNetLemmatizer
from nltk.sentiment import SentimentIntensityAnalyzer

import string
import re

[nltk_data] Downloading package omw-1.4 to
[nltk_data]     /Users/eltontay/nltk_data...
[nltk_data]   Package omw-1.4 is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/eltontay/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/eltontay/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [1]:
def remove_contractions(text):
    """
    Strips the "'s" suffix (possessive / contracted "is") from words.

    Parameters
    ----------
    text: Str
        Raw text.

    Returns
    -------
    Output : Str
        Text with a word-final "'s" removed. Both the straight apostrophe (')
        and the typographic apostrophe (U+2019) are recognised.
    """
    # The suffix must directly follow a word character and end the word.
    # A plain str.replace("'s", "") would also eat an opening quote followed by
    # "s" (e.g. 'stock' -> tock'), corrupting the quoted word.
    return re.sub(r"(?<=\w)['\u2019]s\b", "", text)

def remove_punctuation(text):
    """
    Deletes every character that is neither a word character nor whitespace.

    Parameters
    ----------
    text: Str
        Input text.

    Returns
    -------
    Output : Str
        Text containing only word characters (letters, digits, underscore) and whitespace.
    """
    non_word_or_space = re.compile(r'[^\w\s]')
    return non_word_or_space.sub('', text)

def lowercase(text):
    """
    Converts text to lower case.

    Parameters
    ----------
    text: Str
        Input text.

    Returns
    -------
    Output : Str
        Lower-cased copy of the input.
    """
    lowered = text.lower()
    return lowered

# English stop words, loaded lazily on first use and then reused by every call.
_ENGLISH_STOPWORDS = None


def _english_stopwords():
    """Returns the NLTK English stop-word list as a frozenset, reading the corpus only once."""
    global _ENGLISH_STOPWORDS
    if _ENGLISH_STOPWORDS is None:
        _ENGLISH_STOPWORDS = frozenset(nltk.corpus.stopwords.words('english'))
    return _ENGLISH_STOPWORDS


def remove_stopwords(text):
    """
    Removes English stop words from whitespace-separated text.

    Parameters
    ----------
    text: Str
        Input text. The comparison is case-sensitive against the NLTK list,
        so the text is expected to be lower-cased already.

    Returns
    -------
    Output : Str
        Remaining words joined by single spaces.
    """
    # A cached set avoids re-reading the corpus for every call and gives
    # constant-time membership tests; the result is identical to scanning the list.
    stop_words = _english_stopwords()
    return ' '.join(word for word in text.split() if word not in stop_words)

def remove_special_character(text):
    """
    Replaces newline characters with spaces.

    Parameters
    ----------
    text: Str
        Input text.

    Returns
    -------
    Output : Str
        Text in which every newline has become a single space.
    """
    return ' '.join(text.split('\n'))

def lemmatize(text):
    """
    Lemmatizes every whitespace-separated token as a verb.

    Parameters
    ----------
    text: Str
        Input text.

    Returns
    -------
    Output : Str
        Lemmatized tokens joined by single spaces.
    """
    lemmatizer = WordNetLemmatizer()
    verb_lemmas = []
    for token in text.split():
        # pos='v' asks WordNet for the verb lemma of each token
        verb_lemmas.append(lemmatizer.lemmatize(token, pos='v'))
    return ' '.join(verb_lemmas)

def preprocess_text(text):
    """
    Runs the full text-cleaning pipeline on a single string.

    Parameters
    ----------
    text: Str
        Raw text (e.g. a news title).

    Returns
    -------
    Output : Str
        Cleaned text after all steps below have been applied in order.
    """
    # Order matters: each step receives the output of the previous one.
    # Note: remove_stopwords splits on whitespace and re-joins with spaces, so any
    # newline is already gone by the time remove_special_character runs.
    pipeline = (
        remove_contractions,
        remove_punctuation,
        lowercase,
        remove_stopwords,
        remove_special_character,
        lemmatize,
    )
    for step in pipeline:
        text = step(text)
    return text


In [2]:
# Shared VADER analyzer used by vader_process() below. The imports cell must be
# run first: the output recorded for this cell shows the NameError raised otherwise.
sia = SentimentIntensityAnalyzer()

def vader_process(df) :
    """
    Cleans news titles and scores them with VADER.

    Parameters
    ----------
    df: DataFrame
        News dataset with at least a "Date" and a "Title" column.

    Returns
    -------
    Output : DataFrame
        Copy of the input with "Date" converted to datetime and two added columns:
        "Processed Title" (output of preprocess_text) and "Sentiment Title"
        (the dict returned by sia.polarity_scores for the processed title).
    """
    # Work on a copy: the input is typically derived from another frame
    # (drop_duplicates / dropna), and the caller's data should not change under it.
    df = df.copy()
    df['Date'] = pd.to_datetime(df['Date'])
    df['Processed Title'] = df['Title'].apply(preprocess_text)
    # Only titles are scored; the article body ("Text") is not processed here.
    df['Sentiment Title'] = df['Processed Title'].apply(sia.polarity_scores)
    return df

NameError: name 'SentimentIntensityAnalyzer' is not defined

In [48]:
def sentiment_aggregator(df, title = True, type="mean"):
    """
    Aggregates sentiments on a per day basis.

    Parameters
    ----------
    df: DataFrame
        Dataset generated after sentiment analysis. Needs a "Date" column and a
        "Sentiment Title" column holding the VADER score dicts, either as dict
        objects or as their string form (as read back from CSV).
        The frame is modified in place: "Sentiment Title" is replaced by parsed
        dicts and the columns "Negative", "Neutral", "Positive", "Compound"
        (plus "Sentiment" for "abs_max") are added.
    title: boolean
        Meant to indicate if the news title or news body text is used to generate
        the aggregated sentiment. Currently ignored: "Sentiment Title" is always used.
    type: Str {"mean", "abs_max"}
        To indicate method of calculation.
        "mean": Group by Date and takes mean of "Compound"
        "abs_max": Negates "Negative", keeps whichever of "Negative" and "Positive"
        has the larger absolute value (ties keep "Negative"), then groups by Date
        and takes the mean of this new column

    Returns
    -------
    Output : Series
        Contains aggregated sentiment for each distinct "Date" value

    Raises
    ------
    ValueError
        If `type` is not "mean" or "abs_max".
    """
    # Fail loudly up front instead of silently returning None for an unknown method.
    if type not in ("mean", "abs_max"):
        raise ValueError('type must be "mean" or "abs_max", got {!r}'.format(type))

    target = "Sentiment Title"

    def _as_scores(value):
        # Scores read back from CSV are the str() of a dict (single-quoted keys);
        # swap the quotes so the text is valid JSON. Values that are already
        # dicts pass through, so the function can be called on the same frame again.
        if isinstance(value, str):
            return json.loads(value.replace('\'', '\"'))
        return value

    df[target] = df[target].apply(_as_scores)

    score_columns = (
        ('Negative', 'neg'),
        ('Neutral', 'neu'),
        ('Positive', 'pos'),
        ('Compound', 'compound'),
    )
    for column, key in score_columns:
        df[column] = df[target].apply(lambda scores, key=key: scores.get(key))

    if type == "mean":
        return df.groupby('Date')['Compound'].aggregate('mean')

    # type == "abs_max": sign the negative score, then keep the stronger of the two.
    df['Negative'] = -df['Negative']
    positive_dominates = df['Positive'].abs() > df['Negative'].abs()
    df['Sentiment'] = df['Positive'].where(positive_dominates, df['Negative'])

    return df.groupby('Date')['Sentiment'].aggregate('mean')


In [61]:
def clean_all_data():
    """
    Loads the six scraped news CSVs and applies the same cleaning to each.

    Cleaning drops rows that repeat an earlier ("Date", "Title") pair, then drops
    every row containing a missing value.

    Returns
    -------
    Output : tuple of DataFrame
        (world, politics, coronavirus, aapl, meta, tsla), in that order.
    """
    scraped_csvs = (
        '../Data/1_Scraped/Unstructured_Data/Global/world_news.csv',
        '../Data/1_Scraped/Unstructured_Data/Global/politics_news.csv',
        '../Data/1_Scraped/Unstructured_Data/Global/coronavirus_news.csv',
        '../Data/1_Scraped/Unstructured_Data/Stock/aapl_news.csv',
        '../Data/1_Scraped/Unstructured_Data/Stock/meta_news.csv',
        '../Data/1_Scraped/Unstructured_Data/Stock/tsla_news.csv',
    )

    loaded = [pd.read_csv(path) for path in scraped_csvs]

    # Removing duplicates (the first occurrence of each Date/Title pair is kept)
    deduplicated = [
        frame.drop_duplicates(subset=['Date', 'Title'], keep='first')
        for frame in loaded
    ]

    # No keyword filtering on titles is applied to the stock frames.

    # Drop any NaN
    cleaned = [frame.dropna() for frame in deduplicated]

    return tuple(cleaned)

def vader_all():
    """
    Scores every cleaned news dataset with VADER and saves the results as CSV.

    Reads the datasets through clean_all_data(), runs vader_process() on each,
    and writes one "*_vader.csv" per dataset (without the index) under
    ../Data/2_Processed/Unstructured_Data/.
    """
    world, politics, coronavirus, aapl, meta, tsla = clean_all_data()

    # (cleaned frame, destination), listed in the order they are scored and written
    jobs = [
        (aapl, '../Data/2_Processed/Unstructured_Data/Stock/aapl_vader.csv'),
        (meta, '../Data/2_Processed/Unstructured_Data/Stock/meta_vader.csv'),
        (tsla, '../Data/2_Processed/Unstructured_Data/Stock/tsla_vader.csv'),
        (world, '../Data/2_Processed/Unstructured_Data/Global/world_vader.csv'),
        (politics, '../Data/2_Processed/Unstructured_Data/Global/politics_vader.csv'),
        (coronavirus, '../Data/2_Processed/Unstructured_Data/Global/coronavirus_vader.csv'),
    ]

    # Score everything first so nothing is written unless all datasets were processed.
    scored = [(vader_process(frame), destination) for frame, destination in jobs]

    for frame, destination in scored:
        frame.to_csv(destination, index=False)
    

def aggregate_sentiment_all(title, type):
    """
    Builds one table of aggregated sentiment from the saved VADER CSVs.

    Parameters
    ----------
    title: boolean
        Passed through to sentiment_aggregator.
    type: Str
        Aggregation method, passed through to sentiment_aggregator.

    Returns
    -------
    Output : DataFrame
        One column per dataset ("AAPL", "META", "TSLA", "World", "Politics",
        "Coronavirus"), built by concatenating the per-dataset Series side by side.
    """
    vader_csvs = {
        "World": '../Data/2_Processed/Unstructured_Data/Global/world_vader.csv',
        "Politics": '../Data/2_Processed/Unstructured_Data/Global/politics_vader.csv',
        "Coronavirus": '../Data/2_Processed/Unstructured_Data/Global/coronavirus_vader.csv',
        "AAPL": '../Data/2_Processed/Unstructured_Data/Stock/aapl_vader.csv',
        "META": '../Data/2_Processed/Unstructured_Data/Stock/meta_vader.csv',
        "TSLA": '../Data/2_Processed/Unstructured_Data/Stock/tsla_vader.csv',
    }
    scored = {label: pd.read_csv(path) for label, path in vader_csvs.items()}

    # Column order of the result (stocks first, then the global topics)
    keys = ["AAPL", "META", "TSLA", "World", "Politics", "Coronavirus"]
    per_dataset = [
        sentiment_aggregator(scored[label], title=title, type=type)
        for label in keys
    ]

    return pd.concat(per_dataset, keys=keys, axis=1)


In [62]:
# Score all datasets and write the *_vader.csv files that aggregate_sentiment_all() reads.
vader_all()

In [63]:
# Combine the saved VADER scores into one "abs_max" sentiment table ordered by Date
# and save it. The *_vader.csv inputs are produced by vader_all().
df_all = aggregate_sentiment_all(title=True, type="abs_max").sort_values(by="Date")
df_all.to_csv('../Data/2_Processed/Unstructured_Data/All/all_vader.csv')