In [None]:
#!pip install tweepy

In [None]:
#!pip install TextBlob

In [None]:
#!pip install nltk

In [None]:
#!pip install spacy

In [None]:

#Importing libraries

import tweepy as tw
import pandas as pd
import seaborn as sns
import time
import re
import string
import nltk
import matplotlib.pyplot as plt
plt.rc('figure',figsize=(17,13))
import seaborn as sns
sns.set_style('darkgrid')
import plotly.express as px
from plotly.subplots import make_subplots
nltk.download('vader_lexicon')
from nltk.sentiment.vader import SentimentIntensityAnalyzer as SIA
#from wordcloud import WordCloud,STOPWORDS, ImageColorGenerator
from nltk.corpus import stopwords
from nltk.stem import WordNetLemmatizer
import datetime
import warnings
warnings.filterwarnings("ignore")
print("Library Setup Complete.")

In [None]:
import warnings
warnings.filterwarnings('ignore')

In [None]:
#Importing twitter credentials for scraping the data
# Credentials are read from environment variables so that real secrets never
# have to be typed into (and committed with) the notebook. When a variable is
# unset the value falls back to "" -- the same placeholder used previously.
import os

consumer_key = os.environ.get("TWITTER_CONSUMER_KEY", "")
consumer_secret = os.environ.get("TWITTER_CONSUMER_SECRET", "")
access_key = os.environ.get("TWITTER_ACCESS_KEY", "")
access_secret = os.environ.get("TWITTER_ACCESS_SECRET", "")

In [None]:
# Build the OAuth handler from the consumer pair, then attach the access pair.
auth = tw.OAuthHandler(consumer_key, consumer_secret)
auth.set_access_token(access_key, access_secret)

# API client used by the search cell below; wait_on_rate_limit=True is passed
# through to tweepy unchanged.
api = tw.API(auth, wait_on_rate_limit=True)

In [None]:
number_of_tweets = 1500

# Parallel lists filled in lock-step, one entry per returned status.
# NOTE(review): `time` rebinds the name bound by `import time` in the imports
# cell; the name is kept because the DataFrame cell reads it.
tweets, likes, time = [], [], []

moderna_search = tw.Cursor(
    api.search_tweets,
    q="#Moderna -filter:retweets",
    lang='en',
    tweet_mode="extended",
)

for status in moderna_search.items(number_of_tweets):
    tweets.append(status.full_text)
    likes.append(status.favorite_count)
    time.append(status.created_at)

In [None]:
# Preview only the first few tweets; echoing the whole list floods the cell output.
tweets[:5]

In [None]:
#converting text to dataframe

# One column per collected list; the lists are positionally aligned.
dataframe = pd.DataFrame(
    {
        'tweets': tweets,
        'likes': likes,
        'time': time,
    }
)

In [None]:
dataframe

In [None]:
dataframe.head()

In [None]:
dataframe.tail()

In [None]:
dataframe.dtypes

In [None]:
dataframe.shape

In [None]:
# Examining statistics

dataframe.describe()

In [None]:
# Histogram of tweet timestamps taken from the `time` column.
figure = px.histogram(
    dataframe,
    x='time',
    template='plotly_white',
    title='Number of tweets about moderna per day',
)
# Axis styling is applied in two separate calls instead of one chained call.
figure.update_xaxes(categoryorder='category descending', title='Date')
figure.update_yaxes(title='Number of tweets about moderna per day')
figure.show()

In [None]:
# Looking for unfilled values

dataframe.isnull().sum()

In [None]:
#Removing Re-tweets

# Only drop tweets that start with the retweet prefix "RT @". Matching the bare
# substring "RT" anywhere in the text also discarded ordinary tweets that merely
# contain those two letters (e.g. "ALERT"). na=False keeps rows with missing text
# from breaking the boolean negation.
no_rt_df = dataframe[~dataframe.tweets.str.startswith("RT @", na=False)]

In [None]:
no_rt_df

In [None]:
#Resetting Index

# drop=True discards the old index labels instead of keeping them as a column.
new_df = no_rt_df.reset_index(drop=True)

In [None]:
new_df

In [None]:
#Determining Most liked tweets

# Rank within new_df itself. The previous version took index labels from
# `dataframe` (pre-filter, pre-reset index) and looked them up in `new_df`,
# whose index was renumbered by reset_index, so the labels no longer lined up.
most_liked_tweets = new_df.nlargest(10, 'likes')

In [None]:
most_liked_tweets

In [None]:
#Cleaning the text

def cleanUpTweet(text):
    """Strip Twitter-specific noise from a tweet.

    Removes @mentions, the '#' marker (the hashtag word itself is kept), the
    'RT : ' residue left once a retweet's mention is gone, URLs, and the
    stand-alone words 'the', 'and' and 'to'. Runs of whitespace are collapsed
    to a single space and the result is trimmed.

    Parameters
    ----------
    text : str
        Raw tweet text.

    Returns
    -------
    str
        The cleaned tweet.
    """
    # Remove mentions
    text = re.sub(r'@[A-Za-z0-9_]+', '', text)
    # Remove the hashtag marker but keep the tag text
    text = re.sub(r'#', '', text)
    # Remove the retweet residue ("RT @user: " becomes "RT : " after the step above)
    text = re.sub(r'RT : ', '', text)
    # Remove urls: consume up to the next whitespace so characters such as
    # '-', '_', '?' and '=' do not leave a URL fragment behind
    text = re.sub(r'https?://\S+', '', text)
    # Remove a few stop words. \b anchors restrict the match to whole words;
    # without them "other" became "or" and "today" became "day".
    text = re.sub(r'\b(?:the|and|to)\b', '', text)
    # Collapse the gaps left by the removals
    text = re.sub(r'\s+', ' ', text)
    return text.strip()

# Clean every tweet element-wise and write the result back to the same column.
new_df['tweets'] = new_df['tweets'].map(cleanUpTweet)
#new_df["tweets"]

In [None]:
# Stopword Removal

from nltk.corpus import stopwords

# The stopwords corpus has to be present locally before stopwords.words() can
# read it; only 'vader_lexicon' is downloaded in the imports cell, so fetch it
# here to keep a fresh-kernel run working.
nltk.download('stopwords', quiet=True)

# Set for O(1) membership checks in remove_stopwords.
STOPWORDS = set(stopwords.words('english'))

def remove_stopwords(text, stop_words=None):
    """Remove stop words from ``text``.

    Parameters
    ----------
    text : object
        Input text; it is converted with ``str()`` and split on whitespace.
    stop_words : collection of str, optional
        Lower-case words to drop. Defaults to the module-level ``STOPWORDS``.

    Returns
    -------
    str
        The remaining words joined by single spaces.
    """
    if stop_words is None:
        stop_words = STOPWORDS
    # Compare in lower case so capitalised stop words ("The", "And") are removed
    # too; the kept words retain their original casing.
    return " ".join(
        word for word in str(text).split() if word.lower() not in stop_words
    )


# Drop stop words tweet by tweet, then display the updated column.
new_df['tweets'] = new_df['tweets'].map(remove_stopwords)
new_df["tweets"]

In [None]:
#Removing Emoji

def remove_emoji(string):
    """Return ``string`` with every character in the ranges below removed.

    The ranges are merged into one regex character class; each run of
    matching characters is replaced by the empty string.
    """
    removable_ranges = (
        "\U0001F600-\U0001F64F",  # emoticons
        "\U0001F300-\U0001F5FF",  # symbols & pictographs
        "\U0001F680-\U0001F6FF",  # transport & map symbols
        "\U0001F1E0-\U0001F1FF",  # regional indicators (flags)
        "\U00002500-\U00002BEF",  # box drawing through misc symbols and arrows
        "\U00002702-\U000027B0",  # dingbats
        "\U000024C2-\U0001F251",  # very wide span (also covers CJK text)
        "\U0001f926-\U0001f937",
        "\U00010000-\U0010ffff",  # everything beyond the basic plane
        "\u2640-\u2642",
        "\u2600-\u2B55",
        "\u200d",  # zero-width joiner
        "\u23cf",
        "\u23e9",
        "\u231a",
        "\ufe0f",  # variation selector-16
        "\u3030",
    )
    char_class = "[" + "".join(removable_ranges) + "]+"
    return re.sub(char_class, '', string, flags=re.UNICODE)


# Force every entry to str, strip emoji, and store the result back.
tweets_as_text = new_df["tweets"].map(str)
new_df["tweets"] = tweets_as_text
new_df["tweets"] = tweets_as_text.map(remove_emoji)
new_df

In [None]:
new_df["tweets"]

In [None]:
#Counting the most frequently occurring words

from collections import Counter

# Tally every whitespace-separated word across all tweets in a single pass.
cnt = Counter(
    word
    for tweet_text in new_df["tweets"].values
    for word in tweet_text.split()
)

cnt.most_common(10)

In [None]:
#Splitting data into tokens

def tokenization(text):
    """Split ``text`` into a list of word tokens.

    A token is a maximal run of word characters (``\\w``). Collecting matches
    with ``findall`` avoids the empty-string tokens that ``re.split`` produced
    whenever the text started or ended with a non-word character (or was empty).

    Parameters
    ----------
    text : str
        Text to tokenize.

    Returns
    -------
    list of str
        Tokens in order of appearance; an empty list when there are none.
    """
    return re.findall(r'\w+', text)
# Tokenize each cleaned tweet into its own list, then display the new column.
new_df['tokenized'] = new_df['tweets'].map(tokenization)
new_df['tokenized']

In [None]:
#lemmatization

# The lemmatizer reads the WordNet corpus, which is not downloaded anywhere
# else in this notebook; fetch it so a fresh-kernel run does not fail.
nltk.download('wordnet', quiet=True)
# NOTE(review): some NLTK releases also appear to require 'omw-1.4' for
# WordNet -- confirm against the installed version.
nltk.download('omw-1.4', quiet=True)

wn = nltk.WordNetLemmatizer()

def lemmatizer(text):
    """Lemmatize every token in ``text``.

    Parameters
    ----------
    text : iterable of str
        Tokens to lemmatize.

    Returns
    -------
    list of str
        One lemma per input token, in the same order.
    """
    return [wn.lemmatize(word) for word in text]

# Pass the function directly; wrapping it in a lambda added nothing.
new_df['lemmatized'] = new_df['tokenized'].apply(lemmatizer)
# Only the final expression of a cell is rendered, so the earlier
# new_df.head() call never produced output and has been removed.
new_df.tail()

In [None]:
import numpy as np

# Flatten the per-tweet token lists into one Series with a single token per row.
# explode() replaces the nested loops, whose inner loop reused the outer index
# name `i`; rows with an empty token list come out as NaN and are dropped.
all_words = new_df['lemmatized'].explode().dropna()
# Empty-string tokens are not words and must not be counted.
all_words = all_words[all_words != ''].reset_index(drop=True)

# Frequency table of the 50 most frequent tokens.
common_words = (
    all_words.value_counts()[:50]
    .rename_axis('Common Words')
    .reset_index(name='count')
)

fig = px.treemap(common_words, path=['Common Words'], values='count',title='50 Most Common Words In Tweets')
fig.show()


In [None]:
# Remove the `time` column; rebinding replaces the in-place mutation.
new_df = new_df.drop(columns=["time"])

In [None]:
# Remove the `likes` column; rebinding replaces the in-place mutation.
new_df = new_df.drop(columns=["likes"])

In [None]:
# Remove the `tokenized` column; rebinding replaces the in-place mutation.
new_df = new_df.drop(columns=["tokenized"])

In [None]:
# Remove the `tweets` column; rebinding replaces the in-place mutation.
new_df = new_df.drop(columns=["tweets"])

In [None]:
new_df

In [None]:
# Turn each token list back into plain text by joining on spaces. Calling str()
# on the list stored its repr ("['good', 'vaccine']"), so the brackets, quotes
# and commas became part of the text handed to TextBlob. The isinstance guard
# leaves already-converted values alone, so re-running the cell is harmless.
new_df["lemmatized"] = new_df["lemmatized"].apply(
    lambda tokens: " ".join(tokens) if isinstance(tokens, list) else str(tokens)
)
new_df["lemmatized"]

In [None]:
#new_df

In [None]:
from textblob import TextBlob

In [None]:
#creates a function that determines subjectivity and polarity from the textblob package

def getTextSubjectivity(text):
    """Return the subjectivity component of TextBlob's sentiment for ``text``."""
    blob = TextBlob(text)
    sentiment = blob.sentiment
    return sentiment.subjectivity
def getTextPolarity(text):
    """Return the polarity component of TextBlob's sentiment for ``text``."""
    blob = TextBlob(text)
    sentiment = blob.sentiment
    return sentiment.polarity

In [None]:
#apply these functions to the dataframe

# Both scores are derived from the same `lemmatized` text column.
lemmatized_text = new_df['lemmatized']
new_df['Subjectivity'] = lemmatized_text.map(getTextSubjectivity)
new_df['Polarity'] = lemmatized_text.map(getTextPolarity)

In [None]:
new_df

In [None]:
#builds a function to calculate and categorize each tweet as Negative, Neutral, and Positive

def getTextAnalysis(a):
    """Map a numeric score to a sentiment label.

    Returns "Negative" when ``a`` is below zero, "Neutral" when it equals
    zero, and "Positive" in every other case.
    """
    return "Negative" if a < 0 else ("Neutral" if a == 0 else "Positive")


In [None]:
#creates another column called Score and applies the function to the dataframe

# Label each polarity value element-wise.
new_df['Score'] = new_df['Polarity'].map(getTextAnalysis)

In [None]:
new_df

In [None]:
#calculates the percentage of positive, neutral, and negative tweets among all scored tweets

# Each subset gets its own name; previously `positive` was reassigned to hold
# the neutral and then the negative rows, leaving a misleading variable behind.
positive = new_df[new_df['Score'] == 'Positive']
neutral = new_df[new_df['Score'] == 'Neutral']
negative = new_df[new_df['Score'] == 'Negative']

total_tweets = new_df.shape[0]
for label, subset in (("positive", positive), ("neutral", neutral), ("negative", negative)):
    # Guard the division so an empty frame reports 0.0 instead of raising
    # ZeroDivisionError.
    share = subset.shape[0] / total_tweets * 100 if total_tweets else 0.0
    print(str(share) + " % of " + label + " tweets")

In [None]:
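#Percentages of sentiment recorded for a run of 500 tweets (number_of_tweets is set to 1500 above, so re-running will give different figures)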

#41.85110663983904 % of positive tweets
#41.85110663983904 % of neutral tweets
#16.297786720321934 % of negative tweets