# Remote work sentiment analysis 
# Modelling Vader

# 0-Resources<a id='0_Resources'></a>

https://www.analyticsvidhya.com/blog/2021/01/sentiment-analysis-vader-or-textblob/

jupyter lab --NotebookApp.iopub_data_rate_limit=1.0e10

# 1-Setup<a id='1_Setup'></a>

## 1.1 Import<a id='1.1_Setup'></a>

In [None]:
## for data
import pandas as pd
import numpy as np

## for predicting
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
from sklearn.metrics.pairwise import cosine_similarity

import my_functions as func


## 1.2 Load Data<a id='1.2_Load_Data'></a>

In [None]:
# Load the raw tweets and discard the metadata columns that are not used below.
df = pd.read_csv('tweets.csv')
# Name the target axis explicitly via `columns=`: passing `axis` positionally
# (`drop(labels, 1)`) was deprecated in pandas 1.3 and raises TypeError in 2.0+.
df = df.drop(columns=['Language', 'username', 'date'])
# Optional: uncomment to work on a smaller sample while iterating.
# df = df.iloc[0:5000]
# df.reset_index(inplace = True, drop = True)

In [None]:
df

In [None]:
func.print_tweet(df,len(df))

# 2-Cleaning for BERT and VADER

## Remove 'read more'

<span style="background-color:Teal">Let's remove phrases like **read more**, **learn more**, and **find out more**, as they usually just introduce a link to further reading.</span>

In [None]:
df = func.remove_read_more(df)

In [None]:
# save data as we want to use this also for BERT
df.to_csv('tweetBERT.csv', index=False)

# 3-For VADER only

## 3.1 Remove ending hashtags


In [None]:
df_vader = df.copy()
df_vader = func.remove_end_hashtag(df_vader)


## 3.2 Lemmatize

In [None]:
df_vader['original_tweet'] = df_vader['tweet']


In [None]:
# NOTE(review): the return value is discarded here, whereas the other func.*
# cleaning steps in this file are reassigned to the frame. Presumably
# lemmatized_df mutates df_vader in place -- confirm against my_functions.
func.lemmatized_df(df_vader)

## 3.3 Remove special char


In [None]:
df_vader = func.remove_char_vader(df_vader)


# 4-Pick Two pipeline

## 4.1 Pipeline1: Split hashtag


In [None]:
# copy the data frame so we can split this
df_vader_split = df_vader.copy()

# create new column that we can use to identify which rows has been split
df_vader_split['middle_hashtag'] = False

In [None]:
df_vader_split

In [None]:
func.print_tweet(df_vader_split, len(df_vader_split))

In [None]:
# Pipeline 1: run the shared cleaning pipeline with `del_split_hashtag` on.
df_vader_split = func.custom_pipeline(
    df_vader_split,
    del_split_hashtag=True,  # optional
    del_stopwords=True,
    del_short_tweets=False,
)


In [None]:
func.print_tweet(df_vader_split, len(df_vader_split))

## 4.2 Pipeline2: Don't Split Hashtag

In [None]:
# Pipeline 2: same cleaning steps on a fresh copy, with `del_split_hashtag` off.
df_vader_nosplit = func.custom_pipeline(
    df_vader.copy(),
    del_split_hashtag=False,  # optional
    del_stopwords=True,
    del_short_tweets=False,
)

In [None]:
print(df_vader_nosplit.shape)
print(df_vader_split.shape)

# 5-Refactor the two dfs, make sure they are of the same shape

<span style='background-color:Teal'>We want to check whether splitting hashtags changes the sentiment score, so let's first make sure both dataframes have the same shape; that way they can be compared row by row.</span>

In [None]:
# Collect the row positions of non-split tweets that have no counterpart in the
# split frame (matched on the untouched original text).
# The lookup set is built once; rebuilding `original_tweet.tolist()` and scanning
# it for every row made this step quadratic in the number of tweets.
split_originals = set(df_vader_split.original_tweet)
indices_to_remove = [
    position
    for position, original in enumerate(df_vader_nosplit.original_tweet)
    if original not in split_originals
]

In [None]:
# `indices_to_remove` holds row *positions* (it was filled from an iloc loop),
# while DataFrame.drop() expects index *labels*. Translate positions to labels
# first so the right rows are removed even if the index is not 0..n-1.
labels_to_remove = df_vader_nosplit.index[indices_to_remove]
df_vader_nosplit.drop(labels_to_remove, axis=0, inplace=True)
df_vader_nosplit.reset_index(drop=True, inplace=True)
df_vader_nosplit.shape


In [None]:
df_vader_nosplit

# 6-Predict Sentiment with split hashtag

<span style="background-color:Teal">A positive compound score means positive sentiment and a negative compound score means negative sentiment; a score of exactly 0 is treated as neutral.</span>

<span style="background-color:Teal">Let's assign a score to each row in the df.</span>

In [None]:
# Shared VADER analyzer instance; assign_score() below reads it as a global.
sid = SentimentIntensityAnalyzer()

# 2. Assign Score function
def _sentiment_label(score):
    """Map a compound score to 'neg' (< 0), 'neu' (== 0) or 'pos' (otherwise)."""
    if score < 0:
        return 'neg'
    if score == 0:
        return 'neu'
    return 'pos'


def assign_score(the_df, analyzer=None):
    """Add VADER compound scores and sentiment labels to a dataframe in place.

    Reads the ``tweet`` column and writes two columns on ``the_df``:
    ``VaderScore`` (the ``'compound'`` value returned by ``polarity_scores``)
    and ``VaderSentiment`` (``'neg'``, ``'neu'`` or ``'pos'``).

    Args:
        the_df: dataframe with a ``tweet`` column; it is modified in place.
        analyzer: object exposing ``polarity_scores(text)`` that returns a
            mapping with a ``'compound'`` key. Defaults to the module-level
            ``sid`` analyzer.

    Returns:
        None.
    """
    if analyzer is None:
        # Keep the original behaviour: use the notebook-wide analyzer.
        analyzer = sid

    print('df length: ', len(the_df))

    scores = [
        analyzer.polarity_scores(text)['compound'] for text in the_df['tweet']
    ]
    labels = [_sentiment_label(score) for score in scores]

    # Attach the frame's own index to the results. Assigning a freshly built
    # DataFrame/Series with a default 0..n-1 index aligns on labels, which
    # yields NaN or misplaced values when rows were dropped without resetting
    # the index, and fails outright on an empty frame.
    the_df['VaderScore'] = pd.Series(scores, index=the_df.index, dtype='float64')
    the_df['VaderSentiment'] = pd.Series(labels, index=the_df.index, dtype=object)
    

<span style="background-color:Teal">Let's make the prediction.</span>

In [None]:
# predict
assign_score(df_vader_split)

In [None]:
df_vader_split

# 7-Predict Sentiment with non-split hashtag


In [None]:
# Score the non-split tweets the same way, then display the result
assign_score(df_vader_nosplit)
df_vader_nosplit

# 8-Compare split and non-split tweets

<span style='background-color:Teal'>We want to see which is better: to split or not to split the hashtags.</span>

In [None]:
def compare_tweet(the_df_split, the_df_notsplit):
    """Print every tweet whose sentiment label changes when hashtags are split.

    The two frames are compared row by row, by position. A row is reported
    only when ``middle_hashtag`` is set on the split frame and the
    ``VaderSentiment`` of the split frame differs from that of the non-split
    frame. If no row is reported, a single summary line is printed instead.

    Args:
        the_df_split: frame scored after hashtag splitting. Columns read:
            ``middle_hashtag``, ``tweet``, ``VaderSentiment``, ``VaderScore``.
        the_df_notsplit: frame scored without splitting. Columns read:
            ``tweet``, ``VaderSentiment``, ``VaderScore``.

    Raises:
        ValueError: if the frames differ in length, since a positional
            comparison would then pair unrelated tweets.
    """
    if len(the_df_split) != len(the_df_notsplit):
        raise ValueError(
            'split and non-split dataframes must have the same number of rows '
            f'({len(the_df_split)} != {len(the_df_notsplit)})'
        )

    count = 0
    paired_rows = zip(
        the_df_split.middle_hashtag,
        the_df_split.tweet,
        the_df_split.VaderSentiment,
        the_df_split.VaderScore,
        the_df_notsplit.tweet,
        the_df_notsplit.VaderSentiment,
        the_df_notsplit.VaderScore,
    )
    for (has_middle_hashtag,
         split_tweet, split_sentiment, split_score,
         plain_tweet, plain_sentiment, plain_score) in paired_rows:
        # Only report rows with a hashtag in the middle of the sentence whose
        # label differs between the two frames. (The previous version compared
        # the split frame with itself, so it could never find a difference.)
        if not has_middle_hashtag or split_sentiment == plain_sentiment:
            continue

        count += 1
        print('****SPLIT******:')
        print('TWEET: ', split_tweet)
        print('SENTIMENT: ', split_sentiment)
        print('SCORE: ', split_score, '\n')

        print('****NOT SPLIT******:')
        print('TWEET: ', plain_tweet)
        print('SENTIMENT: ', plain_sentiment)
        print('SCORE: ', plain_score, '\n')

    if count == 0:
        print('all sentiment result are the same')

In [None]:
compare_tweet(df_vader_split, df_vader_nosplit)

<span style='background-color:Teal'>Splitting the hashtags appears to have no effect; presumably the hashtags themselves carry no sentiment.</span>

# 9-View tweets of both sentiments

In [None]:
def print_sentiment_tweet(the_df, sentiment='neg'):
    """Print the original text of every tweet carrying one sentiment label.

    Args:
        the_df: dataframe with ``VaderSentiment`` and ``original_tweet``
            columns.
        sentiment: ``'neg'`` lists the negative tweets; any other value lists
            the positive ones.

    Each match is printed as its row position followed by the tweet text.
    """
    if sentiment == 'neg':
        wanted_label = 'neg'
        banner = 'printing Negative tweets:'
    else:
        wanted_label = 'pos'
        banner = 'printing Positive tweets:'

    print(banner)
    labelled_tweets = zip(the_df.VaderSentiment, the_df.original_tweet)
    for position, (label, original_text) in enumerate(labelled_tweets):
        if label != wanted_label:
            continue
        print('\n', position)
        print(original_text)


In [None]:
# 1. print negative
print_sentiment_tweet(df_vader_split, 'neg')

In [None]:
# 2. print positive
print_sentiment_tweet(df_vader_split, 'pos')

<span style='background-color:Teal'>
Some tweets classified as positive still point out a problem with remote work: <br>
- "I believe remote work is suitable for routine, transactional work, but I expect it <span style="font-weight:bold">**limits culture**</span> to the middle of the curve. Atomic's flexible, co-located model is critical for our future success."</span>

# 10-Compare number of positive vs negative tweets

In [None]:
df_vader = df_vader_split.copy()

In [None]:
df_vader[df_vader.VaderSentiment == 'neg'].shape

In [None]:
df_vader[df_vader.VaderSentiment == 'pos'].shape

<span style='background-color:Teal'>
There are about three times as many positive tweets as negative tweets.</span>