# Creative Extension Analysis

---

**** Explain how we get the dataset

In [1]:
# Importing the libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# Hiding the warnings
import warnings; warnings.simplefilter('ignore') # can we add this one to not see the warnings every time?

# Set random seed for consistency 
import random
from numpy.random import seed as random_seed
from numpy.random import shuffle as random_shuffle
seed = 42
random.seed(seed)
np.random.seed(seed)

# Vader 
import vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# Bert sa-pipeline
from transformers import pipeline

from scipy import stats


## Data preprocessing 

---

In [2]:
# Loading the dataset

tweets = pd.read_csv('./Data/tweets',low_memory=False)
tweets.head()

Unnamed: 0,coordinates,created_at,hashtags,media,urls,favorite_count,id,in_reply_to_screen_name,in_reply_to_status_id,in_reply_to_user_id,...,user_followers_count,user_friends_count,user_listed_count,user_location,user_name,user_screen_name.1,user_statuses_count,user_time_zone,user_urls,user_verified
0,,Sun Feb 14 23:29:33 +0000 2010,,,,0,9116606357,,,,...,29,17,0,"Sofia, Bulgaria",Ina Gerdjikova,InaGerdjikova,6,,http://kutiazamisli.blogspot.com/,False
1,,Fri Sep 24 20:40:25 +0000 2010,,,,0,25437022568,,,,...,12,9,0,,Ana Paula Ribeiro,anadadal,7,,,False
2,,Tue Jun 14 07:54:35 +0000 2011,,,http://kutiazamisli.blogspot.com/2011/06/blog-...,0,80543642816757761,,,,...,29,17,0,"Sofia, Bulgaria",Ina Gerdjikova,InaGerdjikova,6,,http://kutiazamisli.blogspot.com/,False
3,,Tue Aug 12 17:51:37 +0000 2014,,,http://twitcam.livestream.com/g7hpe,0,499251874256920577,,,,...,12,9,0,,Ana Paula Ribeiro,anadadal,7,,,False
4,,Fri Oct 25 11:38:21 +0000 2013,SmurfsVillage,,http://bit.ly/Smurf47,0,393703073228808192,,,,...,6,17,0,Land of the happily single,sheena c wallace,sheasofly,268,,,False


In [3]:
# Exploring the dataset

print(f"In total there are {len(tweets)} tweets.")

In total there are 1609389 tweets.


In [4]:
tweets.columns

Index(['coordinates', 'created_at', 'hashtags', 'media', 'urls',
       'favorite_count', 'id', 'in_reply_to_screen_name',
       'in_reply_to_status_id', 'in_reply_to_user_id', 'lang', 'place',
       'possibly_sensitive', 'retweet_count', 'retweet_id',
       'retweet_screen_name', 'source', 'text', 'tweet_url', 'user_created_at',
       'user_screen_name', 'user_default_profile_image', 'user_description',
       'user_favourites_count', 'user_followers_count', 'user_friends_count',
       'user_listed_count', 'user_location', 'user_name', 'user_screen_name.1',
       'user_statuses_count', 'user_time_zone', 'user_urls', 'user_verified'],
      dtype='object')

> For this analysis we are only concerned with english tweets, so we need to filter the data using the language information

In [5]:
# Selecting only english tweets

en_tweets = tweets.loc[tweets['lang']=='en']
print(f"In total there are {len(en_tweets)} english tweets.")

In total there are 1093122 english tweets.


> We also want to drop duplicate tweets. The reason for this is because if a tweet is repeated many times, then it is most likely due to a bot account. With this analysis we are concern with the sentiment of real users, so data from bot accounts could disrupt the results

In [6]:
# Demonstrating examples of duplicate tweets
en_tweets.value_counts(subset = 'text', sort = True)[0:5]


text
IF U HAVE A FACEBOOK ACCOUNT,PLZ TELL ME UR ACCOUNTNAME&lt;333 U U TELL ME IT WHEN I TELL U MINE:)    2738
Here is a neat little package to create banners  http://perfecttrafficstorm.com/links/19768            171
Check this Out  http://perfecttrafficstorm.com/links/19778                                             170
Grab this package of articles    http://perfecttrafficstorm.com/links/19769                            169
Check out the PLR CHEATER http://perfecttrafficstorm.com/links/19773                                   169
dtype: int64

In [7]:
# Dropping all duplicates tweets
en_tweets.drop_duplicates(subset=['text'], keep = False, inplace = True)
print(f"After dropping the duplicates, there are {len(en_tweets)} english tweets.")


After dropping the duplicates, there are 1054619 english tweets.


In [8]:
# Checking how many users we have 
usernames = en_tweets.value_counts(subset = 'user_name', sort = True)
print(f"There are a total of {len(usernames)} users which posted {len(en_tweets)} english tweets.")
usernames.head(5)


There are a total of 6167 users which posted 1054619 english tweets.


user_name
michelle              3906
Austin Health Jobs    3197
Mark Whittington      3189
Games For Pro         3188
BELIVE ADULT          3162
dtype: int64

In [9]:
# Giving a look to some tweet texts
# it is not printing all of it

print(en_tweets.text.iloc[0])
print(en_tweets.text.iloc[1])

en_tweets.text.head(5)

Will try to live for a week offline... wish me luck! :D
Boa tarde..
 (@shekinahbiscuit live on http://t.co/nU1N0NJJIJ)


0     Will try to live for a week offline... wish me...
3     Boa tarde..\n (@shekinahbiscuit live on http:/...
8     Phew... Valentine's day will be finally over i...
9     up\n (@deiabiscuit32 live on http://t.co/J8uYV...
11    simmm\n (@deiabiscuit32 live on http://t.co/J8...
Name: text, dtype: object

In [10]:
# The tweets texts are in string format
type(en_tweets.text.iloc[0])

str

> Before going on with the sentiment analysis task, there is the need to preprocess the data set. Indeed, from the previous tweets texts we can see that there are many links, stops words which does not add any value to the sentiment classifier. Need to get rid of them 

## Real PREPROCESSING

---
Preprocessing id one of the essential steps in any natural language processing (NLP) task. Common preprocessing techniques are:

- **Letter casing**, that is converting all letters either upper or lower case. However, in text such as tweets often the upper case is used to emphatise a concept. For example, in the case of Sentiment Analysis, great and GREAT have a different sentiment intensity. Therefore, we will not include this step in our pipeline. In addition to that, casefolding could great confusion between words such as apple and Apple. When the dataset is large it is recommended not ot do it. 

- **Noise Removal**, that is eliminating unwanted characters such as URLs, user mentions @ and hashtags symbols #.([Shihab Elbagir and Jing Yang](http://www.iaeng.org/publication/IMECS2019/IMECS2019_pp12-16.pdf))
([Toni Pano and Rasha Kashef](https://www.mdpi.com/2504-2289/4/4/33))

- **Tokenizing**, convert character string into sequence of tokens, that is words separated by white spaces. Tokenicazion in text such as tweets is or crucial importance. Indeed, we can find punctuation marks such as *!?* or *:-)* that we will like tokeep together because they have a intrinsic meaning. We do not want to separe them. Tokenization in twitter is quite straithforward, and on Python it can be implemented with the nltk library.

- **Stemming**

- **Lemmatization**

### Removing special characters from the tweew such as URLs (“http://url”), user mention (@), and the symbol of hashtags (#) to improve the classification

In [11]:
# Importing RE (regular expression) module to replace the http with empty string. 
import re

### need to be better explained .

# 'r' added to not handle backslashes in any special way in the following string
# https? will match either ‘http’ or ‘https’
# https?:\/\/ will match any "http://" and "https://" in the following string
# \S Returns a match where the string does not contain a white space character inside. 
    # Indeed, we want to keep the test that follows the URL.
    
# * Zero or more occurrences --> I think we can delete the *, i dont see any case where it could be useful



clean_tweets = [] 
for i in range(len(en_tweets)):
    
    # Selecting one tweet
    text_i = en_tweets.text.iloc[i]
    
    # Making the tweet in lower case --> no need it
    # text_i = text_i.lower() 
    
    # Deleting the URL only
    text_i = re.sub(r'https?:\/\/\S*', '', text_i, flags = re.MULTILINE)
    
    # Deleting the user mention @username
    text_i = re.sub(r'@\S*', '', text_i, flags = re.MULTILINE)
    
    # Deleting the hashtag symbol #
    text_i = text_i.replace("#", "")
    
    # append the cleaned tweet to the list previously initialized
    clean_tweets.append(text_i) 
    

In [12]:
### WHAT \n MEANS IN SOME TWEETS???? I HAVE NO IDEA... I WOULD DELETE IT BUT IT COULD BE IMORTANT I DUNNO

In [13]:
# compare before and after preprocessing with a tweet that contains all of them.
print(en_tweets.text.iloc[8])
clean_tweets[8]

Have you been to the mountain? Reach new heights on the #SmurfVillage mountain! http://t.co/d9BVrqiU82 @BeelineGames


'Have you been to the mountain? Reach new heights on the SmurfVillage mountain!  '

In [14]:
# Add the new column to the original df

en_tweets['clean_tweets'] = clean_tweets
en_tweets

Unnamed: 0,coordinates,created_at,hashtags,media,urls,favorite_count,id,in_reply_to_screen_name,in_reply_to_status_id,in_reply_to_user_id,...,user_friends_count,user_listed_count,user_location,user_name,user_screen_name.1,user_statuses_count,user_time_zone,user_urls,user_verified,clean_tweets
0,,Sun Feb 14 23:29:33 +0000 2010,,,,0,9116606357,,,,...,17,0,"Sofia, Bulgaria",Ina Gerdjikova,InaGerdjikova,6,,http://kutiazamisli.blogspot.com/,False,Will try to live for a week offline... wish me...
3,,Tue Aug 12 17:51:37 +0000 2014,,,http://twitcam.livestream.com/g7hpe,0,499251874256920577,,,,...,9,0,,Ana Paula Ribeiro,anadadal,7,,,False,Boa tarde..\n ( live on
8,,Sun Feb 14 18:46:26 +0000 2010,,,,0,9107804301,,,,...,17,0,"Sofia, Bulgaria",Ina Gerdjikova,InaGerdjikova,6,,http://kutiazamisli.blogspot.com/,False,Phew... Valentine's day will be finally over i...
9,,Fri Sep 28 19:07:04 +0000 2012,,,http://twitcam.livestream.com/c7fe0,0,251759950655791105,,,,...,9,0,,Ana Paula Ribeiro,anadadal,7,,,False,up\n ( live on
11,,Fri Sep 28 19:05:53 +0000 2012,,,http://twitcam.livestream.com/c7fe0,0,251759653464199168,,,,...,9,0,,Ana Paula Ribeiro,anadadal,7,,,False,simmm\n ( live on
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1609378,,Thu Nov 07 19:47:01 +0000 2013,,,,0,398537090977124353,,,,...,20,0,,Christopher Chang,Mathew_C_Chang,7,,,False,carry me to fantasy victory
1609385,,Tue Jun 24 12:52:16 +0000 2014,,,,0,481419535463890945,,,,...,1528,0,♡,Logan Junior,LoganJuniorrb,11,,,False,"RT Hi lovely 💕 You are my world, my life, my..."
1609386,,Tue Jun 24 11:57:24 +0000 2014,,,,0,481405727018209280,,,,...,1528,0,♡,Logan Junior,LoganJuniorrb,11,,,False,"""Many things about Satanism are offensive."" - ..."
1609387,,Thu May 22 02:02:29 +0000 2014,ArtemisStalkerSkin,,http://bit.ly/11SPvxw,0,469297212129017856,,,,...,2,0,,Espy Brave,BraveEspy,1,,,False,Smite ArtemisStalkerSkin for Free


## Rule-based sentiment analysis - VADER

---

The VADER (Valence Aware Dictionary and sEntiment Reasoner) sentiment analysis consists of 4 columns: positive, neutral, negative, and compound polarity scores. For our case, we are only interested in the compound polarity score to determine the sentiment of the tweets

## Results with the cleaned tweets

In [15]:
#Initializing the Rule-based sentiment analyzer 
analyzer = SentimentIntensityAnalyzer()

In [None]:
en_tweets['sent_score'] = [analyzer.polarity_scores(x)['compound'] for x in en_tweets['clean_tweets']]

In [None]:
def sentiment(scores):
    sents = []
    for score in scores:
        if score >= 0.05:
            sent = 'pos'
        elif score <= -0.05:
            sent = 'neg'
        elif abs(score) <0.05:
            sent = 'neu'
        sents.append(sent)
    return sents

In [None]:
en_tweets['sentiment'] = sentiment(en_tweets.sent_score)

In [None]:
grouped_sents = en_tweets.groupby(by='sentiment')

In [None]:
print('Number of positive sentences:',sum(np.array(en_tweets['sent_score'])>=0.05))
print('Number of negative sentences:',sum(np.array(en_tweets['sent_score'])<=-0.05))
print('Number of neutral sentences:',sum(np.abs(np.array(en_tweets['sent_score']))<0.05))

In [None]:
plt.figure(figsize=(6,6))
plt.bar(grouped_sents.size().index, grouped_sents.size().values, width=0.8)
plt.xlabel('Sentiment')
plt.ylabel('Number of Tweets')
plt.show()

In [None]:
# Threshold as in the other paper 
def sentiment(scores):
    sents = []
    for score in scores:
        if score >= 0.001:
            sent = 'pos'
        elif score <= -0.001:
            sent = 'neg'
        elif abs(score) <0.001:
            sent = 'neu'
        sents.append(sent)
    return sents

en_tweets['sentiment'] = sentiment(en_tweets.sent_score)
grouped_sents2 = en_tweets.groupby(by='sentiment')

plt.figure(figsize=(6,6))
plt.bar(grouped_sents2.size().index, grouped_sents2.size().values, width=0.8)
plt.xlabel('Sentiment')
plt.ylabel('Number of Tweets')
plt.show()

In [None]:
pos_sents = en_tweets.loc[en_tweets['sentiment'] == 'pos']
neg_sents = en_tweets.loc[en_tweets['sentiment'] == 'neg']

Niave analysis on affect of sentiment on retweet count 

In [None]:
print("The average number of retweets for positive tweets is {:.2f}".format(pos_sents.retweet_count.mean()))
print("The average number of retweets for negative tweets is {:.2f}".format(neg_sents.retweet_count.mean()))

In [None]:
stats.ttest_ind(pos_sents.retweet_count, neg_sents.retweet_count)

In [None]:
stats.mannwhitneyu(pos_sents.retweet_count, neg_sents.retweet_count)

## Not sure if the statistical tests above make sense

In [None]:
pos_sents['retweet_count'].describe()

In [None]:
neg_sents['retweet_count'].describe()

In [None]:
pos_sents.sort_values(by=['retweet_count'], ascending = False)['retweet_count'].head(10)

In [None]:
neg_sents.sort_values(by=['retweet_count'], ascending = False)['retweet_count'].head(10)

When looking at simple statistics, we see that positive sentiment tweets have a much larger standard deviation, so there is more likely a significantly larger number of tweets with large retweet counts. Also, looking at the top 10 highest retweeted tweets for positive tweets we find that positive tweets have higher max retween count. So, we need to try and filter the data to avoid effects of these very popular tweets

In [None]:
pos_sents_filtered = pos_sents.loc[pos_sents['retweet_count']<1000]
neg_sents_filtered = neg_sents.loc[neg_sents['retweet_count']<1000]

In [None]:
len(neg_sents_filtered)

In [None]:
print("The average number of retweets for positive tweets is {:.2f}".format(pos_sents_filtered.retweet_count.mean()))
print("The average number of retweets for negative tweets is {:.2f}".format(neg_sents_filtered.retweet_count.mean()))

In [None]:
stats.ttest_ind(pos_sents_filtered.retweet_count, neg_sents_filtered.retweet_count)

In [None]:
stats.mannwhitneyu(pos_sents_filtered.retweet_count, neg_sents_filtered.retweet_count)

However, we may find that the tweets with 0 retweets affect the data. Let's look at tweets with at least one retweet

In [None]:
pos_sents_filtered = pos_sents.loc[(pos_sents['retweet_count']<1000) & (pos_sents['retweet_count'])>0]
neg_sents_filtered = neg_sents.loc[(neg_sents['retweet_count']<1000) & (neg_sents['retweet_count']>0)]

In [None]:
print("The average number of retweets for positive tweets is {:.2f}".format(pos_sents_filtered.retweet_count.mean()))
print("The average number of retweets for negative tweets is {:.2f}".format(neg_sents_filtered.retweet_count.mean()))

In [None]:
stats.ttest_ind(pos_sents_filtered.retweet_count, neg_sents_filtered.retweet_count)

In [None]:
stats.mannwhitneyu(pos_sents_filtered.retweet_count, neg_sents_filtered.retweet_count)

In [None]:
plt.figure(figsize=(8,6))

plt.hist([pos_sents_filtered.retweet_count, neg_sents_filtered.retweet_count], bins=50, label=['Positive Sentiment', 'Negative Sentiment'])
plt.legend(loc = 'upper right')
plt.title('Retweet count for positive and negative sentiment tweets')
plt.xlabel('Retweet Count')
plt.ylabel('Number of Tweets')
plt.yscale('log')
plt.xlim([0, 1000])
plt.show()

In [None]:
pos_same_size = pos_sents_filtered.sample(len(neg_sents_filtered))

In [None]:
plt.figure(figsize=(8,6))

plt.hist([pos_same_size.retweet_count, neg_sents_filtered.retweet_count], bins=50, label=['Positive Sentiment', 'Negative Sentiment'])
plt.legend(loc = 'upper right')
plt.title('Retweet count for positive and negative sentiment tweets')
plt.xlabel('Retweet Count')
plt.ylabel('Number of Tweets')
plt.yscale('log')
plt.xlim([0, 1000])
plt.show()

## Results with the raw tweets

In [None]:
#Initializing the Rule-based sentiment analyzer 
analyzer = SentimentIntensityAnalyzer()

In [None]:
en_tweets['sent_score'] = [analyzer.polarity_scores(x)['compound'] for x in en_tweets['text']]

In [None]:
def sentiment(scores):
    sents = []
    for score in scores:
        if score >=0.05:
            sent = 'pos'
        elif score <=-0.05:
            sent = 'neg'
        elif abs(score) <0.05:
            sent = 'neu'
        sents.append(sent)
    return sents

In [None]:
en_tweets['sentiment'] = sentiment(en_tweets.sent_score)

In [None]:
grouped_sents = en_tweets.groupby(by='sentiment')

In [None]:
print('Number of positive sentences:',sum(np.array(en_tweets['sent_score'])>=0.05))
print('Number of negative sentences:',sum(np.array(en_tweets['sent_score'])<=-0.05))
print('Number of neutral sentences:',sum(np.abs(np.array(en_tweets['sent_score']))<0.05))

In [None]:
plt.figure(figsize=(6,6))
plt.bar(grouped_sents.size().index, grouped_sents.size().values, width=0.8)
plt.xlabel('Sentiment')
plt.ylabel('Number of Tweets')
plt.show()

# Bert SA-pipeline

In [None]:
sa_pipeline = pipeline('sentiment-analysis')

In [None]:
for i in range(10):
    text = en_tweets['text'].iloc[i]
    print(text)
    prediction = sa_pipeline(text)
    print(prediction)
    print()