The objective of this analysis is to analyse the data that has been collected and extract meaningful insights from it.

# Importing libraries

In [None]:
#Importing the libraries to be used
#import os
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
import seaborn as sns
import itertools
import collections
import re

#import panel as pn
#pn.extension('tabulator')
#!pip install hvplot
#import hvplot.pandas

import tweepy as tw
from textblob import TextBlob
#!pip install pygal
#import pygal
!pip install vaderSentiment
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
import nltk
nltk.download('punkt')
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import TweetTokenizer 
from nltk import bigrams
from textblob import TextBlob
import networkx # for creating networknodes
import networkx as nx
#from pandas.io.json import json_normalize

import warnings
# Silence every warning for the rest of the notebook (this also hides
# deprecation and chained-assignment notices raised by the libraries used below).
warnings.filterwarnings("ignore")

# Global seaborn look: scale fonts by 1.5, then switch to the "whitegrid" style.
sns.set(font_scale=1.5)
sns.set_style("whitegrid")

# Reading the Dataset and Getting Info about it


In [None]:
# Load the scraped tweets and expose the tweet body under the shorter column name "text"
df = pd.read_csv('dataset_twitter-scraper.csv').rename(columns={"full_text": "text"})
df.head()

In [None]:
# Only the tweet body and its creation timestamp are used from here on.
columns_of_interest = ["text", "created_at"]
df = df[columns_of_interest]
df.head(4)

In [None]:
# Summary of the frame: column names, non-null counts and dtypes
df.info()

In [None]:
# Remove exact duplicate rows and rebuild the index as 0..n-1.
# Without the reset the index keeps gaps where rows were dropped, so positional
# row access and index-based joins further down would hit missing labels or
# line up scores with the wrong tweets.
df = df.drop_duplicates().reset_index(drop=True)

In [None]:
# Convert the created_at column to a datetime dtype.
# pd.to_datetime is pandas' dedicated timestamp parser (it also understands
# timezone offsets), whereas a plain astype() cast on strings is more fragile.
df['created_at'] = pd.to_datetime(df['created_at'])
df.info()

## Next step: show the top 5 texts with the most reactions (counted as the number of rows sharing the same text)

In [None]:
# Count how many rows share each tweet text and list the five most repeated texts
reactions = df[['text', 'created_at']].groupby('text').count()
reactions.sort_values('created_at', ascending=False).head(5)


## Checking the Date ranges and the peak hours

In [None]:
# Derive three calendar features from the timestamp:
# hour of the day, calendar date and month number.
timestamps = df['created_at']
df = df.assign(
    hour=timestamps.dt.hour,
    date=timestamps.dt.date,
    month=timestamps.dt.month,
)
df.head()

In [None]:
# Number of tweets per calendar date
df['date'].value_counts()

In [None]:
# Daily tweet volume: count the rows per calendar date and draw them as a dashed red line
reactions = df.groupby(['date']).count()
ax = reactions['text'].plot(figsize=(15, 6), ls='--', c='red')
ax.set_ylabel('The Count of tweets collected')
ax.set_title('A Trend on the counts of tweets and the dates created')
# Switch on the grid lines of both axes
for axis in (ax.xaxis, ax.yaxis):
    axis.grid(True)

In [None]:
# Rank the hours of the day by the number of tweets created in them (busiest first)
hourly_counts = df.groupby('hour').count()
reactions = hourly_counts.sort_values('created_at', ascending=False)
reactions.head()

In [None]:
# Tweet volume per hour of the day, drawn as a dashed green line
reactions = df.groupby(['hour']).count()
ax = reactions['text'].plot(figsize=(15, 6), ls='--', c='green')
ax.set_ylabel('The Count of tweets collected')
ax.set_title('A Trend on the counts of tweets and the hours created')
# Switch on the grid lines of both axes
for axis in (ax.xaxis, ax.yaxis):
    axis.grid(True)

## Exploratory Data Analysis (EDA)

In [None]:
# Work on an independent one-column copy so text processing never touches df itself
df_tweets = df.loc[:, ['text']].copy()
df_tweets.tail()

In [None]:
# Drop duplicate rows of df_tweets; it was built from the 'text' column only,
# so this keeps a single row per distinct tweet text.
df_tweets = df_tweets.drop_duplicates()

In [None]:
# Last five rows of df_tweets
df_tweets.tail(5)


## Text Processing

In [None]:
#A Function for cleaning the file (The text column in it)
def text_clean(df_tweets, extra_stopwords=('n', '2', '5', '000')):
  """Clean the tweets in ``df_tweets['text']`` in place.

  The ``text`` column is overwritten with a lower-cased version from which
  retweet prefixes, @mentions and links are removed, and in which bracketed
  fragments and every character outside ``a-z0-9`` are replaced by a space
  (so the word of a hashtag is kept, only the ``#`` goes away).
  A new column ``tweet_without_stopwords`` holds the cleaned text without
  English stop words.

  Parameters
  ----------
  df_tweets : pandas.DataFrame
      Frame with a string column named ``text``; it is modified in place.
  extra_stopwords : iterable of str, optional
      Tokens removed in addition to NLTK's English stop-word list.

  Returns
  -------
  None
  """
  # (pattern, replacement) pairs, applied in exactly this order
  substitutions = [
    (r'rt @[A-Za-z0-9_]+:', ''),  # retweet prefix "rt @user:"
    (r'@[A-Za-z0-9_]+', ''),      # remaining @mentions
    (r'http\S+', ''),             # http / https links
    (r'www\.\S+', ''),            # bare www links; the dot is escaped so only a literal "www." matches
    (r'[()!?]', ' '),             # selected punctuation -> single space
    (r'\[.*?\]', ' '),            # anything inside square brackets
    (r'[^a-z0-9]', ' '),          # every remaining non-alphanumeric character
  ]

  # Missing tweets become empty strings so the string operations below cannot fail on NaN
  text = df_tweets['text'].fillna('').str.lower()
  for pattern, replacement in substitutions:
    text = text.str.replace(pattern, replacement, regex=True)
  df_tweets['text'] = text

  # A set gives constant-time membership tests for the per-word filter
  stop = set(stopwords.words('english')) | set(extra_stopwords)
  df_tweets['tweet_without_stopwords'] = text.apply(
    lambda tweet: ' '.join(word for word in tweet.split() if word not in stop))

In [None]:
# Preview of the tweets before text_clean is applied to them
df_tweets.head()

In [None]:
# Clean the tweets in place, then split every stop-word-free tweet into word tokens
text_clean(df_tweets)
df_tweets['tokenized_sents'] = df_tweets['tweet_without_stopwords'].apply(nltk.word_tokenize)
df_tweets.head()  # preview of the data after cleaning

### Visualizing the text column (unigrams)

In [None]:
# One list of tokens per tweet
words_in_tweet = df_tweets['tokenized_sents'].tolist()
words_in_tweet[:2]

In [None]:
# Word frequencies: flatten the per-tweet token lists into one list of words ...
all_words = list(itertools.chain.from_iterable(words_in_tweet))

# ... and count how often each word occurs
counts_words = collections.Counter(all_words)

counts_words.most_common(15)

In [None]:
# Put the 15 most frequent words and their counts into a DataFrame
top_words = counts_words.most_common(15)
df_counts_words = pd.DataFrame(top_words, columns=['words', 'count'])

df_counts_words.head(10)

In [None]:
# Horizontal bar chart of the most common words (sorted so the largest bar ends up on top)
fig, ax = plt.subplots(figsize=(10, 8))

ranked_words = df_counts_words.sort_values(by='count')
ranked_words.plot.barh(x='words', y='count', ax=ax, color="green")

ax.set_title("Common Words Found in Tweets ")
fig.savefig('count_unigram.png')
plt.show()

In [None]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
# Word cloud built from the most common words collected in df_counts_words
wordcloud2 = WordCloud(background_color="black",max_words=100,width=3000, height=2000,repeat=True).generate(' '.join(df_counts_words["words"]))
# Generate plot: create the figure first so every later call acts on it
plt.figure(figsize=(10,7), facecolor='k')
# `interpolation` is an imshow option; plt.axis does not accept it
plt.imshow(wordcloud2, interpolation="bilinear")
plt.axis("off")
# Applied after drawing so it adjusts this figure instead of creating an empty one
plt.tight_layout(pad=0)
plt.savefig('cloud_uni.png')
plt.show()

## Collection of Words – Bigrams

In [None]:
# Token lists of all tweets, one list per tweet
tweets_words = df_tweets['tokenized_sents'].tolist()
tweets_words[2]

In [None]:
# Keep only tweets that still contain at least one token after cleaning
tweets_words_new = [tokens for tokens in tweets_words if tokens]
tweets_words_new[2]

In [None]:
# For every non-empty tweet, build the list of its consecutive word pairs (bigrams)
terms_bigram = [list(pairs) for pairs in map(bigrams, tweets_words_new)]

# View the bigrams of the tweet at position 2
terms_bigram[2]

In [None]:
# Flatten list of bigrams in clean tweets.
# The result is deliberately NOT called `bigrams`: that name is the function
# imported from nltk, and rebinding it would make the cell that builds
# terms_bigram fail when it is run again.
all_bigrams = list(itertools.chain.from_iterable(terms_bigram))

# Create counter of words in clean bigrams
bigram_counts = collections.Counter(all_bigrams)

bigram_counts.most_common(20)

In [None]:
# DataFrame holding the 20 most common bigrams and how often each occurs
top_bigrams = bigram_counts.most_common(20)
bigram_df = pd.DataFrame(top_bigrams, columns=['bigram', 'count'])

bigram_df

## Visualize Networks of Bigrams

In [None]:
# Single-record list whose only dict maps each bigram tuple to its count
d = bigram_df.set_index('bigram').T.to_dict('records')

# Undirected graph: one node per word, one edge per bigram
G = nx.Graph()

# Edge weight is the bigram count scaled by 10
G.add_weighted_edges_from(
    (first_word, second_word, count * 10)
    for (first_word, second_word), count in d[0].items()
)

In [None]:
fig, ax = plt.subplots(figsize=(20, 15))

# Fixed seed: spring_layout starts from random node positions, so without it
# the drawing (and the saved PNG) would differ on every run.
pos = nx.spring_layout(G, k=2, seed=42)

# Plot networks (built-in labels are off; offset labels are added below)
nx.draw_networkx(G, pos,
                 font_size=16,
                 width=3,
                 edge_color='red',
                 node_color='black',
                 with_labels = False,
                 ax=ax)

# Create offset labels so the text boxes do not sit on top of the nodes
for key, value in pos.items():
    x, y = value[0]+.135, value[1]+.045
    ax.text(x, y,
            s=key,
            bbox=dict(facecolor='aqua', alpha=0.25),
            horizontalalignment='center', fontsize=17)
plt.title('Visualize Networks of Bigrams')  
plt.savefig('bigrams_network.png')
plt.show()

# Polarity

In [None]:
#Function to get the subjectivity Subjectivity refers to an individual's feelings, opinions, or preferences.
def getSubjectivity(text):
  """Return the subjectivity score TextBlob assigns to ``text``."""
  return TextBlob(text).sentiment.subjectivity

#Create a function to get the polarity (Tells how positive or negative the text is)
def getPolarity(text):
  """Return the polarity score TextBlob assigns to ``text``."""
  return TextBlob(text).sentiment.polarity

#Create two new columns.
# Each tweet is analysed once and the result feeds both columns, instead of
# building a TextBlob twice per tweet via the two helpers above.
tweet_sentiments = [TextBlob(text).sentiment for text in df['text']]
df['Subjectivity'] = [sentiment.subjectivity for sentiment in tweet_sentiments]
df['Polarity'] = [sentiment.polarity for sentiment in tweet_sentiments]

#show the first rows of the dataframe with the new columns
df.head()

In [None]:
# Word cloud over the text of every tweet, joined into one long string
allwords = ' '.join(df['text'])
cloud_settings = dict(width=500, height=300, random_state=21, max_font_size=119)
wordCloud = WordCloud(**cloud_settings).generate(allwords)

plt.imshow(wordCloud, interpolation='bilinear')
plt.axis('off')
plt.show()

In [None]:
#Map a polarity score to a sentiment label
def getAnalysis(score):
  """Label a polarity ``score``.

  Returns 'Negative' for scores below zero, 'Neutral' for exactly zero and
  'Positive' for everything else.
  """
  if score < 0:
    return 'Negative'
  return 'Neutral' if score == 0 else 'Positive'

# Label every tweet from its Polarity score
df['Analysis'] = df['Polarity'].apply(getAnalysis)

#Show the first rows only instead of dumping the whole dataframe
df.head()

In [None]:
# Order the tweets by their Polarity score, lowest (most negative) first
sortedDF = df.sort_values(by='Polarity')
sortedDF

In [None]:
#Plot the polarity and subjectivity
# One vectorised scatter call over the whole columns: no hard-coded row count
# and no label-based df[col][i] lookups, so it works for any dataset size and
# any index.
fig, ax = plt.subplots(figsize=(28, 10))
ax.scatter(df['Polarity'], df['Subjectivity'], color='black')
ax.set_title("Sentiment Analysis Distribution")
ax.set_xlabel('Polarity')
ax.set_ylabel('Subjectivity')
plt.show()

In [None]:
# Share of tweets labelled 'Positive', in percent with one decimal
ptweets = df.loc[df['Analysis'] == 'Positive', 'text']

round(len(ptweets) / len(df) * 100, 1)

In [None]:
# Share of tweets labelled 'Negative', in percent with one decimal
ntweets = df.loc[df['Analysis'] == 'Negative', 'text']

round(len(ntweets) / len(df) * 100, 1)

In [None]:
# Bar chart with the number of tweets per sentiment label
ax = sns.countplot(x='Analysis', data=df)

# Title and axis labels
ax.set_title('Sentiment Analysis')
ax.set_xlabel('Sentiment')
ax.set_ylabel('Counts')
plt.show()

# Polarity (positive, negative and neutral scores for each tweet)

In [None]:
# Using polarity_scores() we find the compound, positive, negative and
# neutral scores for each tweet.
from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
analyzer = SentimentIntensityAnalyzer()

# Declare variables for scores
compound_list = []
positive_list = []
negative_list = []
neutral_list = []

# Iterate over the tweet texts themselves rather than over range(n) with
# df['text'][i]: that lookup is label-based and breaks (or picks the wrong row)
# whenever the index is not exactly 0..n-1. Each tweet is scored once and the
# four values are read from that single result.
scores = []
for tweet_text in df['text']:
    tweet_scores = analyzer.polarity_scores(tweet_text)
    scores.append({"Compound": tweet_scores["compound"],
                   "Positive": tweet_scores["pos"],
                   "Negative": tweet_scores["neg"],
                   "Neutral": tweet_scores["neu"]
                  })

Convert the list of score dictionaries into a DataFrame (`sentiments_score`), then join it with the `df` DataFrame.

In [None]:
# `scores` holds one dict per row of df, in row order. Give the score frame
# df's own index so the index-based join pairs every tweet with its own scores
# even when df's index is not 0..n-1 (a length mismatch raises immediately).
sentiments_score = pd.DataFrame(scores, index=df.index)
df = df.join(sentiments_score)
df.head()

In [None]:
#Finding the percentages of +ve, -ve and neutral
#Calculating the percentages
def percentage_polarity(part, whole_data):
  """Return ``part`` as a percentage of ``whole_data``, rounded to one decimal.

  Both arguments are converted with ``float()``. When ``whole_data`` is zero
  (for example an empty dataset) 0.0 is returned instead of raising
  ZeroDivisionError.
  """
  whole = float(whole_data)
  if whole == 0:
    return 0.0
  return round(100 * float(part) / whole, 1)

# Classify every tweet by comparing its VADER 'Positive' and 'Negative' scores
# in one vectorised pass (rows with missing scores match none of the masks).
negative_mask = df['Negative'] > df['Positive']
positive_mask = df['Positive'] > df['Negative']
neutral_mask = df['Positive'] == df['Negative']

negative = int(negative_mask.sum())
positive = int(positive_mask.sum())
neutral = int(neutral_mask.sum())

# Remember the text of each negative tweet: one string per tweet,
# not the entire text column once per negative row.
negative_list.extend(df.loc[negative_mask, 'text'])

positive_percentage = percentage_polarity(positive, df.shape[0])
negative_percentage = percentage_polarity(negative, df.shape[0])
neutral_percentage = percentage_polarity(neutral, df.shape[0])

print(f"Negative : Counts {negative} Its Percentage = {negative_percentage}%")
print(f"positive : Counts {positive}  Its Percentage = {positive_percentage}%")
print(f"neutral : Counts {neutral}  Its Percentage = {neutral_percentage}%")

In [None]:
# Donut chart of the sentiment shares: a pie with a white disc drawn over its centre
sizes = [positive_percentage, neutral_percentage, negative_percentage]
labels = [
    f'Positive [{positive_percentage}%]',
    f'Neutral [{neutral_percentage}%]',
    f'Negative [{negative_percentage}%]',
]
colors = ['black', 'green', 'red']

patches, texts = plt.pie(sizes, colors=colors, startangle=90)
my_circle = plt.Circle((0, 0), 0.5, color='white')
plt.gcf().gca().add_artist(my_circle)

plt.style.use('default')
plt.legend(labels)
plt.title("Sentiment Analysis Result ")
plt.axis('equal')
plt.show()

# plotting wordcloud for positive, neutral and negative

In [None]:
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
def word_cloud(wd_list, title=None):
    """Draw a word cloud for a collection of texts.

    Parameters
    ----------
    wd_list : iterable of str
        The texts to visualise; they are joined with spaces into one string.
    title : str, optional
        Title placed above the figure. No title is drawn when omitted.

    Returns
    -------
    None
        The cloud is drawn on a new matplotlib figure. When ``wd_list``
        contains no text a message is printed and nothing is drawn, so an
        empty selection does not abort the notebook.
    """
    all_words = ' '.join(wd_list)
    if not all_words.strip():
        print('No text available to build a word cloud.')
        return

    # Named `cloud_stopwords` so it does not hide the nltk `stopwords` import
    cloud_stopwords = set(STOPWORDS)
    wordcloud = WordCloud(
        background_color='black',
        stopwords=cloud_stopwords,
        width=1600,
        height=800,
        random_state=1,
        colormap='jet',
        max_words=80,
        max_font_size=200).generate(all_words)
    plt.figure(figsize=(12, 10))
    plt.axis('off')
    if title is not None:
        plt.title(title)
    plt.imshow(wordcloud, interpolation="bilinear")
word_cloud(df['text'])

In [None]:
# Word cloud of the tweets whose negative score exceeds their positive score
negative_mask = df['Positive'] < df['Negative']
word_cloud(df.loc[negative_mask, 'text'])

In [None]:
# Word cloud of the tweets whose positive score exceeds their negative score
positive_mask = df['Positive'] > df['Negative']
word_cloud(df.loc[positive_mask, 'text'])

In [None]:
# Word cloud of the tweets whose positive and negative scores are equal
neutral_mask = df['Positive'] == df['Negative']
word_cloud(df.loc[neutral_mask, 'text'])


A word cloud is an informative visual representation of a text dataset, highlighting its most popular and trending keywords based on their frequency of occurrence and importance.