# In [2]:
from typing import Dict, List, Tuple
import pandas as pd
import numpy as np
import datetime

from IPython import display

import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib import dates
import networkx as nx

from sklearn.ensemble import RandomForestRegressor
from sklearn. model_selection import train_test_split
from sklearn.preprocessing import OneHotEncoder
import os

# Notebook-wide plot styling: seaborn's 'darkgrid' theme and a 16pt base font
# for every matplotlib figure created afterwards.
sns.set_style(style='darkgrid')
plt.rcParams['font.size'] = 16


class tweetSentiment:
    """Load one region's tweet export and compute weekly sentiment statistics.

    The input is a CSV named ``<rawDir><region>.csv``.  The code reads the
    columns ``year_week``, ``date``, ``sentiment``, ``hit_sentence`` and
    ``key_phrases`` from it; any other columns are carried along unchanged.
    """

    def __init__(self, rawDir: str = './../meltwater/final_query/all_topics__',
                 region: str = 'bay_area'):
        """Remember the region, its data directory and the plot colours.

        Args:
            rawDir: Path prefix that the region name is appended to.
            region: Region name; also used in plot titles and log messages.
        """
        self.mRegion = region
        self.mDir = rawDir + self.mRegion + "/"
        # Line colour per sentiment label; unknown labels fall back to grey
        # at plot time.
        self.mColorDict = {'Negative': 'red',
                           'Neutral': 'darkkhaki',
                           'Positive': 'royalblue',
                           'NotRated': 'darkcyan'}

    def getData(self) -> pd.DataFrame:
        """Read the region's CSV and return it prepared for analysis.

        Returns:
            The frame produced by ``_prepTweetsDF4Analysis``.
        """
        # The CSV sits next to the region directory: "<rawDir><region>.csv".
        # Only the trailing slash is stripped, so a region name that also
        # appears earlier in the path is left alone.
        base = self.mDir[:-1] if self.mDir.endswith('/') else self.mDir
        file_in = base + '.csv'

        print(f"Reading data from {file_in}")
        allTweetsDF = pd.read_csv(file_in, low_memory=False)
        print(f"""Got all tweets data for {self.mRegion} """
              f"""({len(allTweetsDF)} rows) from {file_in}""")

        return self._prepTweetsDF4Analysis(allTweetsDF)

    def _prepTweetsDF4Analysis(self, allTweetsDF: pd.DataFrame) -> pd.DataFrame:
        """Add week, one-hot sentiment and share/URL columns to the raw tweets.

        Args:
            allTweetsDF: Raw tweets with at least ``year_week``, ``date``,
                ``sentiment`` and ``hit_sentence`` columns.

        Returns:
            A new frame, re-indexed from 0, without duplicate rows and without
            rows whose ``hit_sentence`` is missing.  Added columns:
            ``week_of`` (smallest ``date`` within the row's ``year_week``),
            ``sentiment_<label>`` indicator columns, ``is_retweet``,
            ``is_quote``, ``is_share``, ``is_url`` and ``orig_tweet``.
        """
        # Starting date of each week = smallest date seen for that year_week.
        # Selecting the column before aggregating avoids computing min() over
        # every (possibly non-comparable) column of the frame.
        week_start = (allTweetsDF.groupby('year_week')['date']
                      .min()
                      .rename('week_of')
                      .reset_index())

        df = allTweetsDF.merge(week_start, on='year_week')
        # Labels such as "Not Rated" become "NotRated".
        df['sentiment'] = df['sentiment'].str.replace(' ', '', regex=False)

        # One-hot-encoded sentiments go in as extra COLUMNS (axis=1) so every
        # tweet row carries its own indicator values.
        dummies = pd.get_dummies(df['sentiment'], prefix='sentiment')
        dfS = pd.concat([df, dummies], axis=1)

        # Clean up first so the flags below are computed on real text only.
        dfS = (dfS.drop_duplicates()
                  .dropna(subset=['hit_sentence'])
                  .reset_index(drop=True))

        # No missing values remain, so casting to str only affects
        # non-string entries and keeps the .str accessor usable even when
        # the frame is empty.
        text = dfS['hit_sentence'].astype(str)

        # Identify retweets, quotes, shares and tweets containing a link.
        dfS['is_retweet'] = text.str.startswith('RT @')
        dfS['is_quote'] = text.str.startswith('QT @')
        dfS['is_share'] = dfS['is_retweet'] | dfS['is_quote']
        dfS['is_url'] = text.str.contains('http', regex=False)
        # Tweet text with the share markers removed.
        dfS['orig_tweet'] = (text.str.replace('RT @', '', regex=False)
                                 .str.replace('QT @', '', regex=False))

        return dfS

    def _plotWeeklySentiment(self,
                             weekly_aggr_wide: pd.DataFrame,
                             ylabel: str,
                             title: str) -> None:
        """Draw one line per sentiment column against ``week_of``.

        Args:
            weekly_aggr_wide: Wide table whose first column is ``week_of`` and
                whose remaining columns are one series per sentiment.
            ylabel: Label for the y axis.
            title: Figure title.
        """
        sentiment_cols = list(weekly_aggr_wide.columns[1:])
        if weekly_aggr_wide.empty or not sentiment_cols:
            # Nothing numeric to draw; DataFrame.plot would raise here.
            print("No weekly sentiment data to plot")
            return

        weekly_aggr_wide.plot(
            x='week_of',
            linewidth=3.0,
            figsize=(16, 8),
            color=[self.mColorDict.get(col, '#333333') for col in sentiment_cols])
        plt.ylabel(ylabel, fontsize=18)
        plt.xlabel('Week Start Date', fontsize=18)
        plt.yticks(fontsize=14)
        # One tick per week, labelled with the week's start date.
        plt.xticks(list(range(len(weekly_aggr_wide.week_of))),
                   weekly_aggr_wide.week_of,
                   rotation=90,
                   fontsize=13)
        plt.legend(fontsize=18)
        plt.title(title, fontsize=22)
        plt.show()

    def analyzeSharedTweetsSentiment(self,
                                     dfS: pd.DataFrame,
                                     showplots: bool,
                                     includeNotRated: bool = False
                                    ) -> pd.DataFrame:
        """Get weekly stats on the sentiment of shared tweets and plot them.

        For every (week, sentiment) pair the number of shared tweets
        (``is_share``) is divided by the number of tweets in that week.

        Args:
            dfS: Prepared tweets with ``week_of``, ``sentiment``, ``is_share``
                and ``key_phrases`` columns.
            showplots: Draw the weekly line chart when true.
            includeNotRated: Keep the ``'NotRated'`` sentiment when true.

        Returns:
            Wide table with a ``week_of`` column followed by one column per
            sentiment holding that fraction.
        """
        # Overall share statistics, printed for information only.
        shares = dfS.is_share.sum()
        totals = len(dfS)
        frac_shares = float(shares) / totals if totals else 0.0
        unique_key_phrases = dfS.key_phrases.unique()

        print(f"We have {totals} tweets, of which {shares} ({frac_shares:.3f}) are retweets or quotes")
        print(f"We have {len(unique_key_phrases)} unique key phrases")

        # Number of tweets (with a sentiment value) in each week.
        weekly_total = (dfS.groupby('week_of')['sentiment']
                        .count()
                        .rename('total')
                        .reset_index())

        # Number of shared tweets in each (week_of, sentiment) pair.
        weekly_shared = (dfS.groupby(['week_of', 'sentiment'])['is_share']
                         .sum()
                         .rename('shared')
                         .reset_index())

        dfSwk = weekly_total.merge(weekly_shared, on='week_of')
        dfSwk['shared_frac_total'] = dfSwk['shared'].astype(float) / dfSwk['total']

        if not includeNotRated:
            dfSwk = dfSwk.loc[dfSwk.sentiment != 'NotRated'].reset_index(drop=True)

        weekly_aggr_wide = dfSwk.pivot_table(index='week_of',
                                             columns='sentiment',
                                             values='shared_frac_total').reset_index(drop=False)

        if showplots:
            self._plotWeeklySentiment(
                weekly_aggr_wide,
                ylabel='Fraction of Total Shared Tweets',
                title=f"Dynamics of Sentiment in Shared Tweets for {self.mRegion}")

        return weekly_aggr_wide

    def analyzeSentiment(self,
                         dfS: pd.DataFrame,
                         showplots: bool,
                         includeNotRated: bool = False,
                         tweets_or_news: str = 'Tweets'
                        ) -> pd.DataFrame:
        """Get weekly sentiment fractions over all items and plot them.

        For every (week, sentiment) pair the number of rows is divided by the
        number of rows in that week.

        Args:
            dfS: Prepared data with ``week_of``, ``sentiment`` and
                ``year_week`` columns.
            showplots: Draw the weekly line chart when true.
            includeNotRated: Keep the ``'NotRated'`` sentiment when true.
            tweets_or_news: Noun used in the plot title and y-axis label.

        Returns:
            Wide table with a ``week_of`` column followed by one column per
            sentiment holding that fraction.
        """
        # Weekly totals (rows with a year_week value).
        weekly_totals = (dfS.groupby('week_of')['year_week']
                         .count()
                         .rename('total')
                         .reset_index())

        # Weekly totals for each sentiment.
        weekly_s_totals = (dfS.groupby(['week_of', 'sentiment'])['year_week']
                           .count()
                           .rename('sentiment_total')
                           .reset_index())

        # Join the two and compute weekly sentiment fractions.
        weekly_aggr = weekly_totals.merge(weekly_s_totals, on='week_of')
        weekly_aggr['frac_sentiment'] = (weekly_aggr['sentiment_total'].astype(float)
                                         / weekly_aggr['total'])

        if not includeNotRated:
            weekly_aggr = weekly_aggr.loc[
                weekly_aggr.sentiment != 'NotRated'].reset_index(drop=True)

        weekly_aggr_wide = weekly_aggr.pivot_table(index='week_of',
                                                   columns='sentiment',
                                                   values='frac_sentiment').reset_index(drop=False)

        if showplots:
            self._plotWeeklySentiment(
                weekly_aggr_wide,
                # Keep the axis label in step with the title's noun.
                ylabel=f"Fraction of Total {tweets_or_news}",
                title=f"Dynamics of Sentiment in All {tweets_or_news} for {self.mRegion}")

        return weekly_aggr_wide