In [1]:
#Spark
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark import SparkConf

#Hadoop
from hdfs import InsecureClient

#Spark SQL functions
from pyspark.sql.functions import *
from pyspark.sql.functions import from_utc_timestamp, udf, array_distinct, col, when
from pyspark.sql.functions import regexp_replace, year, month, dayofmonth, hour, format_string
from pyspark.sql.functions import monotonically_increasing_id

# Spark Datatypes
from pyspark.sql.types import StringType, TimestampType, DateType, IntegerType
from pyspark.sql.types import DoubleType, StructType, FloatType, StructField


#Pandas
import pandas as pd
import json
import emoji
import stylecloud
from collections import Counter


## Warnings conf

In this section, Python warnings are suppressed so that fewer log messages appear while the code runs.

In [2]:
import warnings

# Silence every Python warning for the rest of the session to keep cell output short.
# NOTE(review): this also hides deprecation warnings from the libraries used below — confirm that is intended.
warnings.filterwarnings('ignore') 


# Data Loading

## Reading Data from Hadoop

In this section we are reading data from Hadoop, using a Spark Session.

### Spark configurations and Connection

In [3]:
def spark_start(restart = False, appname="HadoopAccess"):
    """Create (or reuse) the local SparkSession used by this notebook.

    Parameters
    ----------
    restart : bool, default False
        When true, stop the session that is currently active (if there is one)
        before building a new one.
    appname : str, default "HadoopAccess"
        Application name passed to the Spark configuration.

    Returns
    -------
    SparkSession
        Session on master ``local[*]`` with ``spark.sql.session.timeZone`` set to UTC.
    """
    if restart:
        # `spark` is assigned further down, which makes it a local name of this
        # function; the former `spark.stop()` therefore always raised
        # UnboundLocalError. Look the running session up explicitly instead.
        active_session = SparkSession.getActiveSession()
        if active_session is not None:
            active_session.stop()

    # Configuration parameters for Spark
    spark_conf = SparkConf().setMaster("local[*]").setAppName(appname)

    # Build the session (getOrCreate reuses an existing one when it has not been stopped)
    spark = (
        SparkSession.builder
        .config(conf=spark_conf)
        .config('spark.sql.session.timeZone', 'UTC')
        .getOrCreate()
    )

    return spark


spark = spark_start()

# Raise the log threshold to ERROR so lower-severity Spark messages do not clutter the cell output
sc = spark.sparkContext
sc.setLogLevel('ERROR')


2023-05-30 17:06:30,410 WARN util.Utils: Your hostname, BDS-2023 resolves to a loopback address: 127.0.1.1; using 192.168.0.110 instead (on interface wlo1)
2023-05-30 17:06:30,411 WARN util.Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
2023-05-30 17:06:31,248 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Creating Models in Spark

In [4]:
# Read every parquet file matching the glob under /CA4/sentiment into one DataFrame
# NOTE(review): the path has no scheme, so it resolves against the session's default filesystem (presumably HDFS) — confirm
tweets_pred_saved = spark.read.parquet("/CA4/sentiment/**/*.parquet")

                                                                                

## Binary Sentiment

### Creating a binary sentiment

The binary sentiment label is derived from the score: a tweet is positive (1) when its score is greater than 0 and negative (0) otherwise.

In [5]:
# Derive the binary sentiment label (stored as a string): '1' when score > 0, otherwise '0'.
# Rows whose score is exactly 0 or null fall into the otherwise branch and are labelled '0'.
tweets_fc_df = tweets_pred_saved.withColumn('sentiment',when(col("score") > 0, '1').otherwise('0'))

In [6]:
# Print the schema to confirm the new sentiment column and its type
tweets_fc_df.printSchema()

root
 |-- created_at: timestamp (nullable = true)
 |-- text: string (nullable = true)
 |-- cleaned_text: string (nullable = true)
 |-- entities: string (nullable = true)
 |-- prediction: double (nullable = true)
 |-- textblob: string (nullable = true)
 |-- vader: string (nullable = true)
 |-- score: double (nullable = true)
 |-- sentiment: string (nullable = false)



In [7]:
# Retweet flag: RT is 1 when the raw text starts with "RT", otherwise 0.
# NOTE(review): the prefix test also matches any tweet that merely begins with the letters "RT" — confirm this is acceptable.
tweets_fc = tweets_fc_df.withColumn("RT", when(col("text").startswith("RT"), 1).otherwise(0))

In [8]:
spark.conf.set("spark.sql.legacy.timeParserPolicy","LEGACY")

from pyspark.sql.functions import year, month, dayofmonth, hour, col, mean, count, to_date
from pyspark.sql.functions import sum as spark_sum


def _aggregate_hourly(tweets, *extra_keys):
    """Aggregate tweets per calendar hour.

    Groups ``tweets`` by year, month, day, date and hour of ``created_at``
    (plus any ``extra_keys`` appended after the hour) and returns, per group,
    the mean score (``hourly_score``), the number of non-null scores
    (``count_score``) and the sum of the RT flag (``sum_RT``), ordered by
    year, month, day and hour.
    """
    return (
        tweets.groupBy(
            year("created_at").alias("year"),
            month("created_at").alias("month"),
            dayofmonth("created_at").alias("day"),
            to_date(col("created_at")).alias("date"),
            hour("created_at").alias("hour"),
            *extra_keys,
        )
        .agg(
            mean(col("score")).alias("hourly_score"),
            count(col("score")).alias("count_score"),
            spark_sum(col("RT")).alias("sum_RT"),
        )
        .orderBy("year", "month", "day", "hour")
    )


# Hourly aggregation split by sentiment label
tweets_hour = _aggregate_hourly(tweets_fc, "sentiment")
tweets_hour.printSchema()
tweets_hour.show(2)

# Hourly aggregation over all tweets together
tweets_hour_b = _aggregate_hourly(tweets_fc)
tweets_hour_b.printSchema()


tweets_hour_b.show(2)

root
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- date: date (nullable = true)
 |-- hour: integer (nullable = true)
 |-- sentiment: string (nullable = false)
 |-- hourly_score: double (nullable = true)
 |-- count_score: long (nullable = false)
 |-- sum_RT: long (nullable = true)



                                                                                

+----+-----+---+----------+----+---------+--------------------+-----------+------+
|year|month|day|      date|hour|sentiment|        hourly_score|count_score|sum_RT|
+----+-----+---+----------+----+---------+--------------------+-----------+------+
|2021|   11|  1|2021-11-01|   0|        0|-0.11364917380729236|         97|    67|
|2021|   11|  1|2021-11-01|   0|        1|   0.216212583461736|         59|    46|
+----+-----+---+----------+----+---------+--------------------+-----------+------+
only showing top 2 rows

root
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- date: date (nullable = true)
 |-- hour: integer (nullable = true)
 |-- hourly_score: double (nullable = true)
 |-- count_score: long (nullable = false)
 |-- sum_RT: long (nullable = true)





+----+-----+---+----------+----+--------------------+-----------+------+
|year|month|day|      date|hour|        hourly_score|count_score|sum_RT|
+----+-----+---+----------+----+--------------------+-----------+------+
|2021|   11|  1|2021-11-01|   0|0.011106234390609392|        156|   113|
|2021|   11|  1|2021-11-01|   1|0.006761358632407023|        155|   110|
+----+-----+---+----------+----+--------------------+-----------+------+
only showing top 2 rows



                                                                                

In [9]:
# Overall number of tweets
total_tweets = tweets_fc.count()
print(f"Total of Tweets:{total_tweets:,}")

# Number of distinct calendar days that have at least one tweet, then the per-day average
tweet_dates = tweets_fc.select(to_date(col("created_at")).alias("data"))
day_summary = tweet_dates.agg(countDistinct("data").alias("total_dias")).first()
days = day_summary["total_dias"]
mean_tweets = total_tweets / days
print(f"Average of {mean_tweets:,.0f} tweets per day ({days} total)")


Total of Tweets:763,266
Average of 2,091 tweets per day (365 total)


# Working with all Tweets

## Exporting to Pandas

In [10]:
# Render created_at as a 'yyyy-MM-dd HH:mm:ss' string in Spark, then collect the whole frame into pandas
tweets_fc_pd = tweets_fc.withColumn("created_at", date_format("created_at", "yyyy-MM-dd HH:mm:ss")).toPandas()
# Collapse doubled backslashes in the serialized entities into single ones.
# regex=False is spelled out: it is only the default from pandas 2.0 on, and as a
# regular expression this pattern/replacement pair is not valid (the replacement
# would be a dangling backslash).
tweets_fc_pd['entities'] = tweets_fc_pd['entities'].str.replace('\\\\', '\\', regex=False)

                                                                                

## Parsing Json

In [11]:
# Display one raw entities value to see how the JSON is encoded
tweets_fc_pd['entities'][1]

'"{\\"hashtags\\": [], \\"urls\\": [{\\"url\\": \\"https://t.co/iK0Wcd6juB\\", \\"expanded_url\\": \\"https://twitter.com/i/web/status/1466194552960827398\\", \\"display_url\\": \\"twitter.com/i/web/status/1\\u2026\\", \\"indices\\": [117, 140]}], \\"user_mentions\\": [{\\"screen_name\\": \\"GLHolcombe\\", \\"name\\": \\"\\u2693Slowboat\\u2693\\", \\"id\\": 4919735069, \\"id_str\\": \\"4919735069\\", \\"indices\\": [0, 11]}, {\\"screen_name\\": \\"ucjobee\\", \\"name\\": \\"Joey\\ud83c\\udfb6Let\'s Go Brandon\\ud83d\\udce3\\", \\"id\\": 24134872, \\"id_str\\": \\"24134872\\", \\"indices\\": [12, 20]}, {\\"screen_name\\": \\"SirajAHashmi\\", \\"name\\": \\"Siraj Hashmi\\", \\"id\\": 1129762164, \\"id_str\\": \\"1129762164\\", \\"indices\\": [21, 34]}, {\\"screen_name\\": \\"GayPatriotFL\\", \\"name\\": \\"Homo For The Holidays\\", \\"id\\": 1453715178248278027, \\"id_str\\": \\"1453715178248278027\\", \\"indices\\": [35, 48]}], \\"symbols\\": []}"'

In [12]:
def extract_entities(text):
    """Parse a serialized tweet ``entities`` value and count its entity lists.

    Parameters
    ----------
    text : str
        JSON text of the entities object. Values that were JSON-encoded twice
        (the first decode yields a string) are decoded a second time.

    Returns
    -------
    tuple
        ``(json_data, hashtags, urls, user_mentions, symbols, error)`` where
        ``json_data`` is the parsed dict, the four middle items are the lengths
        of the corresponding lists and ``error`` is 0 on success.
        When the value cannot be parsed, is not a JSON object, or lacks one of
        the four keys, ``json_data`` is a dict with four empty lists, all counts
        are 0 and ``error`` is 1.
    """
    try:
        json_data = json.loads(text)
        # Some rows are encoded twice: the first pass returns the inner JSON text.
        if isinstance(json_data, str):
            json_data = json.loads(json_data)
        hashtags = len(json_data['hashtags'])
        urls = len(json_data['urls'])
        user_mentions = len(json_data['user_mentions'])
        symbols = len(json_data['symbols'])

        return json_data, hashtags, urls, user_mentions, symbols, 0

    except (ValueError, TypeError, KeyError, AttributeError):
        # ValueError covers json.JSONDecodeError; TypeError covers non-string input
        # (e.g. None) and payloads that are not JSON objects; KeyError covers a
        # missing entity list.
        # The fallback is a dict (not JSON text) so every row has the same type,
        # and error is 1 so failures are visible when the column is summed.
        empty_entities = {"hashtags": [], "urls": [], "user_mentions": [], "symbols": []}
        return empty_entities, 0, 0, 0, 0, 1

In [13]:
# Run the parser on every row; each result is a 6-item tuple
entity_rows = tweets_fc_pd['entities'].apply(extract_entities)

# Spread the tuples over six new columns, aligned on the original index
entity_columns = ['json', 'hashtags_c', 'urls_c', 'user_mentions_c', 'symbols_c', 'error']
tweets_fc_pd[entity_columns] = pd.DataFrame(entity_rows.tolist(), index=tweets_fc_pd.index)

# Release the intermediate Series
del entity_rows
tweets_fc_pd.sample(2)

Unnamed: 0,created_at,text,cleaned_text,entities,prediction,textblob,vader,score,sentiment,RT,json,hashtags_c,urls_c,user_mentions_c,symbols_c,error
345803,2021-11-03 21:15:59,RT @denisrancourt: BREAKING NEWS:\nBrave Ontar...,BREAKING NEWS Brave Ontario hospital workers ...,"{""urls"": [{""url"": ""https://t.co/MOAJEllEq6"", ""...",0.0,0.8,0.4404,0.46515,1,1,"{'urls': [{'url': 'https://t.co/MOAJEllEq6', '...",0,1,1,0,0
439868,2022-02-20 16:27:23,RT @ElectionWiz: JUST IN: 5th Circuit Court of...,JUST IN 5th Circuit Court of Appeals finds Un...,"{""urls"": [], ""symbols"": [], ""hashtags"": [], ""u...",0.0,-0.5,0.0772,-0.15855,0,1,"{'urls': [], 'symbols': [], 'hashtags': [], 'u...",0,0,1,0,0


In [14]:
# Compare the raw entities text with its parsed form for one row (608409 is a hard-coded row label)
print(tweets_fc_pd['entities'][608409])
print(tweets_fc_pd['json'][608409])

{"urls": [], "symbols": [], "hashtags": [], "user_mentions": [{"id": 2374676774, "name": "Kelly Flynn", "id_str": "2374676774", "indices": [3, 16], "screen_name": "KellyFlynnnn"}]}
{'urls': [], 'symbols': [], 'hashtags': [], 'user_mentions': [{'id': 2374676774, 'name': 'Kelly Flynn', 'id_str': '2374676774', 'indices': [3, 16], 'screen_name': 'KellyFlynnnn'}]}


In [15]:
# Column totals for the four entity counters, followed by the parse-error flag
print("Sum of")
for entity_kind in ("hashtags", "urls", "user_mentions", "symbols"):
    print(f"{entity_kind}: {tweets_fc_pd[entity_kind + '_c'].sum()}")
print(f"error: {tweets_fc_pd['error'].sum()}")

Sum of
hashtags: 120453
urls: 173752
user_mentions: 872978
symbols: 996
error: 0


In [16]:
# Cast the string columns to datetime / integer dtypes
tweets_fc_pd['sentiment'] = tweets_fc_pd['sentiment'].astype(int)
tweets_fc_pd['created_at'] = pd.to_datetime(tweets_fc_pd['created_at'])

# Expand each parsed entities dict into columns
entity_frame = pd.json_normalize(tweets_fc_pd['json'])

# Append the expanded columns side by side (rows are matched on the index)
tweets_fc_pd = pd.concat([tweets_fc_pd, entity_frame], axis = 1)
del entity_frame

tweets_fc_pd.sample(2)

Unnamed: 0,created_at,text,cleaned_text,entities,prediction,textblob,vader,score,sentiment,RT,...,hashtags_c,urls_c,user_mentions_c,symbols_c,error,hashtags,urls,user_mentions,symbols,media
318893,2021-11-10 09:13:22,@CoinVaccine Nice project #VAX #VaccineCoin #B...,Nice project VAX VaccineCoin BEP20 BSC,"{""urls"": [], ""symbols"": [], ""hashtags"": [{""tex...",0.0,0.6,0.4215,0.383062,1,0,...,4,0,1,0,0,"[{'text': 'VAX', 'indices': [26, 30]}, {'text'...",[],"[{'id': 1384079967214997508, 'name': 'Vaccine ...",[],
48844,2021-12-22 18:44:25,RT @joncoopertweets: BIG: The US Army is devel...,BIG The US Army is developing a SINGLEdose Co...,"""{\""hashtags\"": [], \""urls\"": [], \""user_menti...",0.0,0.0666666666666666,0.0,0.025,1,1,...,0,0,1,0,0,[],[],"[{'screen_name': 'joncoopertweets', 'name': 'J...",[],


## Top 10

### User Mentions

In [17]:
def get_top(col, top_of, top_of_rename = None, sentiment = None, top_number = None, df = None):
    """Rank the values of one field across a column holding lists of entity dicts.

    Parameters
    ----------
    col : str
        Name of the column whose cells are lists of dicts (e.g. ``'user_mentions'``).
        Inside this function the name shadows any module-level ``col``.
    top_of : str
        Key of the entity dicts to count (e.g. ``'screen_name'``).
    top_of_rename : str, optional
        Header to use for the value column in the result; defaults to ``top_of``.
    sentiment : optional
        When given, only rows whose ``sentiment`` equals this value are used.
    top_number : int, optional
        Keep only the first ``top_number`` rows of the ranking; all rows when omitted.
    df : pandas.DataFrame, optional
        Frame to read from. When omitted, the module-level ``tweets_fc_pd`` is
        looked up at call time (not frozen at definition time).

    Returns
    -------
    pandas.DataFrame
        Two columns, ``top_of_rename or top_of`` and ``'Value'`` (the count),
        sorted by descending count. Empty (with the same two columns) when the
        selected rows contain no entities.
    """
    if df is None:
        df = tweets_fc_pd

    label = top_of if top_of_rename is None else top_of_rename

    # One row per entity dict; rows with empty lists or missing values drop out.
    rows = df if sentiment is None else df[df.sentiment == sentiment]
    entities = pd.DataFrame(rows[col].explode().dropna().tolist())

    if entities.empty:
        return pd.DataFrame({label: pd.Series(dtype="object"),
                             "Value": pd.Series(dtype="int64")})

    counts = entities[top_of].value_counts()
    if top_number is not None:
        counts = counts.head(top_number)

    # Name both output columns explicitly: the names value_counts() gives its
    # index and values differ between pandas 1.x and 2.x.
    return counts.rename_axis(label).reset_index(name="Value")



            

In [18]:
# Ten most frequent screen names mentioned in tweets with sentiment 1
get_top(col = 'user_mentions', top_of = 'screen_name', top_of_rename = 'Screen Name', sentiment = 1, top_number = 10)

Unnamed: 0,Screen Name,Value
0,disclosetv,3255
1,RWMaloneMD,2155
2,POTUS,2049
3,RepThomasMassie,1598
4,drsimonegold,1531
5,DrEliDavid,1483
6,Rob_Roos,1386
7,JackPosobiec,1318
8,MichaelPSenger,1260
9,EricTopol,1257


### URL

In [19]:
# Ten most frequent expanded URLs in tweets with sentiment 1
get_top(col = 'urls', top_of = 'expanded_url', sentiment = 1, top_number = 10)

Unnamed: 0,expanded_url,Value
0,https://phmpt.org/wp-content/uploads/2021/11/5...,235
1,https://childrenshealthdefense.org/defender/sh...,165
2,https://onlyfans.com/juanchox007,126
3,https://www.bitchute.com/video/1lR5MpxmefZE/,88
4,https://babylonbee.com/news/businesses-requiri...,86
5,https://petition.parliament.uk/petitions/602171,81
6,https://www.sirillp.com/wp-content/uploads/202...,80
7,https://babylonbee.com/news/psaki-says-everyon...,71
8,https://rairfoundation.com/alert-japan-places-...,69
9,https://trialsitenews.com/top-uk-cardiologist-...,56


### Hashtag

In [20]:
# Ten most frequent hashtag texts in tweets with sentiment 1
get_top(col = 'hashtags', top_of = 'text', sentiment = 1, top_number = 10)

Unnamed: 0,text,Value
0,COVID19,6149
1,vaccine,2849
2,Pfizer,2157
3,CovidVaccine,1006
4,vaccines,997
5,COVID,751
6,NoVaccineMandates,711
7,VaccineSideEffects,581
8,Omicron,578
9,vaccination,567


## Extracting Emojis

In [21]:
def extract_emojis(text):
    """Return the emoji characters of ``text``, concatenated in their original order.

    The text is scanned one character (code point) at a time; a character is
    kept when it is a key of ``emoji.EMOJI_DATA`` and dropped otherwise.
    """
    kept = [char for char in text if char in emoji.EMOJI_DATA]
    return ''.join(kept)


In [22]:
# Add a column holding the emoji characters found in each tweet's raw text
tweets_fc_pd['Emojis'] = tweets_fc_pd['text'].apply(extract_emojis)

# Show two random tweets that contain at least one emoji
tweets_fc_pd[tweets_fc_pd['Emojis'] != ""].sample(2)

Unnamed: 0,created_at,text,cleaned_text,entities,prediction,textblob,vader,score,sentiment,RT,...,urls_c,user_mentions_c,symbols_c,error,hashtags,urls,user_mentions,symbols,media,Emojis
160784,2022-01-08 02:01:24,@gatec1 @globeandmail Head start to send the v...,Head start to send the vaccine to the entire ...,"""{\""hashtags\"": [], \""urls\"": [], \""user_menti...",0.0,0.0,0.0,0.0,0,0,...,0,2,0,0,[],[],"[{'screen_name': 'gatec1', 'name': 'Lorac', 'i...",[],,😂
71150,2021-12-31 12:19:58,@pezzertron @MarzipanRug @BarSolara You really...,You really are getting desperate The vaccines...,"""{\""hashtags\"": [], \""urls\"": [], \""user_menti...",0.0,0.0333333333333333,0.0865,0.044937,1,0,...,0,3,0,0,[],[],"[{'screen_name': 'pezzertron', 'name': 'The Pe...",[],,🤷♂🤷♂


In [23]:
# Row labels used for spot checks: 456732, 436030
i = 436030

# Print the tweet text, the extracted emojis, and the length of the extracted string
print(tweets_fc_pd['text'][i], '\n')
print(tweets_fc_pd['Emojis'][i], '\n')
# len() counts the characters of the Emojis string, i.e. one per kept code point
print(f"Total emojis: {len(tweets_fc_pd['Emojis'][i])}")

RT @juanchox007: This tattooed daddy wanted to taste some of the vaccine and he came for his dose 💉💦💦
🔃&amp;❤ For More

⬇️⬇️⬇️⬇️FULL VIDEO⬇️⬇️… 

💉💦💦🔃❤⬇⬇⬇⬇⬇⬇ 

Total emojis: 11


# Time Aggregation

In [24]:
# Collect the hourly, per-sentiment aggregation into a pandas DataFrame
tweets_hour_pd = tweets_hour.toPandas()

                                                                                

In [25]:
dataset_scaled_EDA = tweets_hour_pd.copy()

# Smallest and largest tweet count observed for each (hour, sentiment) pair,
# repeated on every row of that pair
counts_by_slot = dataset_scaled_EDA.groupby(['hour','sentiment'])['count_score']
slot_min = counts_by_slot.transform('min')
slot_max = counts_by_slot.transform('max')
dataset_scaled_EDA['min_hour'] = slot_min
dataset_scaled_EDA['max_hour'] = slot_max

# Min-max scale the count within its (hour, sentiment) pair
count_range = dataset_scaled_EDA['max_hour'] - dataset_scaled_EDA['min_hour']
dataset_scaled_EDA['hour_scaled'] = (dataset_scaled_EDA['count_score'] - dataset_scaled_EDA['min_hour']) / count_range

# Day name as an ordered categorical (Monday first)
weekday_order = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
weekday_names = [d.strftime('%A') for d in dataset_scaled_EDA['date']]
dataset_scaled_EDA['day_of_week'] = pd.Categorical(weekday_names, categories=weekday_order, ordered=True)

# Full timestamp: the date combined with the hour of day
date_text = dataset_scaled_EDA['date'].astype(str)
hour_text = pd.to_datetime(dataset_scaled_EDA['hour'], format='%H').dt.time.astype(str)
dataset_scaled_EDA['datetime'] = pd.to_datetime(date_text + ' ' + hour_text, format='%Y-%m-%d %H:%M:%S')

# Readable sentiment label: '1' becomes 'Positive', anything else 'Negative'
dataset_scaled_EDA['sentiment'] = dataset_scaled_EDA['sentiment'].map(
    lambda flag: 'Positive' if flag == '1' else 'Negative'
)
dataset_scaled_EDA.head(10) 

Unnamed: 0,year,month,day,date,hour,sentiment,hourly_score,count_score,sum_RT,min_hour,max_hour,hour_scaled,day_of_week,datetime
0,2021,11,1,2021-11-01,0,Positive,0.216213,59,46,1,164,0.355828,Monday,2021-11-01 00:00:00
1,2021,11,1,2021-11-01,0,Negative,-0.113649,97,67,3,226,0.421525,Monday,2021-11-01 00:00:00
2,2021,11,1,2021-11-01,1,Positive,0.16842,69,48,5,173,0.380952,Monday,2021-11-01 01:00:00
3,2021,11,1,2021-11-01,1,Negative,-0.122941,86,62,5,238,0.347639,Monday,2021-11-01 01:00:00
4,2021,11,1,2021-11-01,2,Negative,-0.095417,80,56,2,175,0.450867,Monday,2021-11-01 02:00:00
5,2021,11,1,2021-11-01,2,Positive,0.19856,65,48,4,155,0.403974,Monday,2021-11-01 02:00:00
6,2021,11,1,2021-11-01,3,Negative,-0.112407,73,48,6,216,0.319048,Monday,2021-11-01 03:00:00
7,2021,11,1,2021-11-01,3,Positive,0.223755,60,44,4,137,0.421053,Monday,2021-11-01 03:00:00
8,2021,11,1,2021-11-01,4,Positive,0.214048,46,30,2,136,0.328358,Monday,2021-11-01 04:00:00
9,2021,11,1,2021-11-01,4,Negative,-0.147583,59,45,4,171,0.329341,Monday,2021-11-01 04:00:00


In [26]:
tweets_hour_pd_b = tweets_hour_b.toPandas()

# Smallest and largest tweet count observed for each hour of day,
# repeated on every row of that hour
counts_by_hour = tweets_hour_pd_b.groupby(['hour'])['count_score']
hour_min = counts_by_hour.transform('min')
hour_max = counts_by_hour.transform('max')
tweets_hour_pd_b['min_hour'] = hour_min
tweets_hour_pd_b['max_hour'] = hour_max

# Min-max scale the count within its hour of day
hour_range = tweets_hour_pd_b['max_hour'] - tweets_hour_pd_b['min_hour']
tweets_hour_pd_b['hour_scaled'] = (tweets_hour_pd_b['count_score'] - tweets_hour_pd_b['min_hour']) / hour_range

# Day name as an ordered categorical (Monday first)
weekday_order_b = ['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday']
weekday_names_b = [d.strftime('%A') for d in tweets_hour_pd_b['date']]
tweets_hour_pd_b['day_of_week'] = pd.Categorical(weekday_names_b, categories=weekday_order_b, ordered=True)

# Full timestamp: the date combined with the hour of day
date_text_b = tweets_hour_pd_b['date'].astype(str)
hour_text_b = pd.to_datetime(tweets_hour_pd_b['hour'], format='%H').dt.time.astype(str)
tweets_hour_pd_b['datetime'] = pd.to_datetime(date_text_b + ' ' + hour_text_b, format='%Y-%m-%d %H:%M:%S')


def _label_hourly_score(score):
    """Map an hourly mean score to 'Positive', 'Negative' or 'Neutral'."""
    if score > 0:
        return 'Positive'
    if score < 0:
        return 'Negative'
    return 'Neutral'


# Sentiment label taken from the sign of the hourly mean score
tweets_hour_pd_b['sentiment'] = tweets_hour_pd_b['hourly_score'].apply(_label_hourly_score)

tweets_hour_pd_b.head(10) 



Unnamed: 0,year,month,day,date,hour,hourly_score,count_score,sum_RT,min_hour,max_hour,hour_scaled,day_of_week,datetime,sentiment
0,2021,11,1,2021-11-01,0,0.011106,156,113,4,335,0.459215,Monday,2021-11-01 00:00:00,Positive
1,2021,11,1,2021-11-01,1,0.006761,155,110,11,328,0.454259,Monday,2021-11-01 01:00:00,Positive
2,2021,11,1,2021-11-01,2,0.036365,145,104,7,295,0.479167,Monday,2021-11-01 02:00:00,Positive
3,2021,11,1,2021-11-01,3,0.039245,133,92,16,317,0.388704,Monday,2021-11-01 03:00:00,Positive
4,2021,11,1,2021-11-01,4,0.010846,105,75,9,250,0.39834,Monday,2021-11-01 04:00:00,Positive
5,2021,11,1,2021-11-01,5,-0.010645,84,61,6,207,0.38806,Monday,2021-11-01 05:00:00,Negative
6,2021,11,1,2021-11-01,6,0.001514,87,61,9,185,0.443182,Monday,2021-11-01 06:00:00,Positive
7,2021,11,1,2021-11-01,7,0.048603,77,58,8,217,0.330144,Monday,2021-11-01 07:00:00,Positive
8,2021,11,1,2021-11-01,8,0.042397,90,69,5,211,0.412621,Monday,2021-11-01 08:00:00,Positive
9,2021,11,1,2021-11-01,9,0.041829,87,67,4,209,0.404878,Monday,2021-11-01 09:00:00,Positive


# Dashboard

In [27]:
# Independent copy of the scaled hourly frame, taken at the start of the dashboard section
df = dataset_scaled_EDA.copy()

In [28]:
import pandas as pd
import dash
from jupyter_dash import JupyterDash
import dash_bootstrap_components as dbc
import dash_core_components as dcc
import dash_html_components as html
from dash_bootstrap_templates import load_figure_template

import numpy as np
import plotly.graph_objs as go
import plotly.express as px
from dash.dependencies import Input,Output
from dash import callback_context, Dash, html, dcc, Input, Output, dash_table

## Components

In [29]:
# Load the 'minty' figure template from dash_bootstrap_templates
# NOTE(review): presumably this also becomes the default template for the Plotly figures below — confirm
load_figure_template('minty')

In [30]:
# Left column: the logo image
title_logo_col = dbc.Col(
    dbc.CardImg(src="/assets/images/CCT_Logo.jpeg", className="img-fluid rounded-start"),
    width={"size": 3},
)

# Right column: the two-line dashboard heading
title_heading = html.Div(
    [
        html.P("Twitter Analytics for", className="card-title mx-auto"),
        html.H1("Vaccine", className="text-primary mx-auto"),
    ]
)
title_heading_col = dbc.Col(
    [title_heading],
    width={"size": 3, "offset": 3},
    align="center",
)

# Header row of the dashboard
title = dbc.Row([title_logo_col, title_heading_col])



In [31]:
def _icon_card(value, icon_class, caption):
    """Build one headline card: a thousands-separated number above an icon and caption."""
    return dbc.Card(
        dbc.CardBody(
            [
                html.H1('{:,.0f}'.format(value)),
                html.P([html.I(className=icon_class), caption], className="text-nowrap"),
            ], className="border-start border-success border-5"
        ),
        className="shadow my-2 text-center m-2",
        style={"maxWidth": 350},
    )


# Number of distinct screen names that appear in user mentions
user_mentions_count = len(get_top(col = 'user_mentions', top_of = 'screen_name'))

#CARDS WITH ICONS ----------------------------------------------------------------------------------------------------
card_user_mentions = _icon_card(user_mentions_count, "bi bi-people me-2", " Users Tagged")
cards_total_tweets = _icon_card(total_tweets, "bi bi-twitter me-2", " Tweets")
cards_total_retweets = _icon_card(tweets_hour_pd_b['sum_RT'].sum(), "bi bi-repeat me-2", " Retweets")



In [32]:

#CARD WITH TIMELINE -------------------------------------------------------------------------------------------------
# Line chart of count_score against date
timeline_figure = px.line(
    dataset_scaled_EDA,
    x='date',
    y='count_score',
    labels={"date": "Date", "count_score": "Total of Tweets"},
)

timeline_header = dbc.CardHeader(
    html.H2("Timeline", className="text-primary text-start card-title-large align-middle"),
)
timeline_graph = dcc.Graph(
    id='timeline-plot',
    figure=timeline_figure,
    style={'height': '350px'},
)

# Card wrapping the header and the chart
timeline = dbc.Card(
    dbc.CardBody([timeline_header, timeline_graph]),
    className="shadow my-2 text-center m-2",
    style={'margin': '0 auto'},
    color="primary",
    outline=True,
)


In [33]:
from datetime import datetime, timedelta

tweets_fc_pd['created_at'] = pd.to_datetime(tweets_fc_pd['created_at'])
tweets_fc_pd['sentiment'] = tweets_fc_pd['sentiment'].astype(int)

#COUNT STATISTICS ----------------------------------------------------------------------------------------------
# Every window ends at the most recent tweet; look that timestamp up once.
latest_tweet_at = tweets_fc_pd['created_at'].max()


def _count_last_days(days):
    """Count non-null scores among tweets created within ``days`` days of the latest tweet."""
    cutoff = latest_tweet_at - timedelta(days=days)
    return tweets_fc_pd.loc[tweets_fc_pd['created_at'] >= cutoff, 'score'].count()


total_7 = _count_last_days(7)
total_30 = _count_last_days(30)
total_60 = _count_last_days(60)

# Per-day averages reuse the totals instead of filtering the frame a second time
mean_7 = total_7 / 7
mean_30 = total_30 / 30
mean_60 = total_60 / 60

text_7 = f'{total_7:,.0f} ({mean_7:,.2f} per day)'
text_30 = f'{total_30:,.0f} ({mean_30:,.2f} per day)'
text_60 = f'{total_60:,.0f} ({mean_60:,.2f} per day)'


#CARD BODY WITH STATISTICS ------------------------------------------------------------------------------------------
def _history_row(label, value_text):
    """One 'Last N days' entry: the label on one row, the value text on the next."""
    return dbc.Row([
        dbc.Row(dbc.Col(html.H6(label, className="float-start m-1"))),
        dbc.Row(dbc.Col(html.P(value_text, className="float-end text-primary"))),
    ])


# Overall average of tweets per day
average_row = dbc.Row([
    dbc.Row(dbc.Col(html.H5("Av. Tweets: ", className="float-start"))),
    dbc.Row(dbc.Col(html.P(['{:,.0f}'.format(mean_tweets),' per day'], className="text-primary float-end"))),
])

# Shaded panel with the three trailing windows
history_heading = dbc.Row(dbc.Row(dbc.Col(html.H5("Historical Data", className="text-center text-black w-100"))))
history_rows = [
    _history_row(window_label, window_text)
    for window_label, window_text in (
        ("Last 7 days: ", text_7),
        ("Last 30 days: ", text_30),
        ("Last 60 days: ", text_60),
    )
]
history_panel = html.Div([history_heading] + history_rows, className="bg-light bg-gradient border rounded-top")

stat_data = dbc.CardBody([average_row, history_panel], style={'height': '350px'})


# STATISTICS CARD --------------------------------------------------------------------------------------------------
# Header shown above the statistics body built in `stat_data`.
statics_header = dbc.CardHeader(
    html.H2("Statistic", className="text-primary text-center card-title-large align-middle"),
)

# Outlined card combining the header and the statistics body.
statics = dbc.Card(
    dbc.CardBody([statics_header, stat_data]),
    className="shadow my-2 m-2",
    style={'margin': '0 auto'},
    color="primary",
    outline=True,
)

In [34]:
# Extremes of the hourly score and the earliest timestamp, used to place the reference lines and labels.
max_score = tweets_hour_pd_b['hourly_score'].max()
min_score = tweets_hour_pd_b['hourly_score'].min()
min_date = tweets_hour_pd_b['datetime'].min()


def build_sentiment_timeline_figure():
    """Build the bubble chart of `hourly_score` over `datetime`.

    Bubbles are sized by `count_score` and coloured by `sentiment`. Thin gray
    horizontal lines mark the maximum score, the minimum score and zero, and
    each line gets a text annotation anchored at the earliest timestamp.
    """
    fig = px.scatter(
        tweets_hour_pd_b,
        x="datetime",
        y="hourly_score",
        size="count_score",
        color="sentiment",
        size_max=60,
        labels={"datetime": "Date", "count_score": "Count Scaled"},
    )

    # The y axis itself is hidden; the annotated reference lines replace it.
    fig.update_yaxes(visible=False, showticklabels=False, zeroline=True)

    # (y position, annotation text) for the max, min and neutral levels, in drawing order.
    reference_levels = [
        (max_score, f"<b> Most Positive </b> <br>{max_score:,.2f}"),
        (min_score, f"<b> Most Negative </b> <br>{min_score:,.2f}"),
        (0, "<b> Neutral</b> <br> 0.00"),
    ]

    for level, _caption in reference_levels:
        fig.add_hline(y=level, line_width=0.1, line_color="gray")

    for level, caption in reference_levels:
        fig.add_annotation(
            x=min_date,
            y=level,
            text=caption,
            ax=-40,
            ay=0,
            opacity=0.7,
            arrowcolor="#41b6c4",
        )

    # Pin the legend to the top-right corner of the plotting area.
    fig.update_layout(legend=dict(x=1, y=1, xanchor='right', yanchor='top'))
    return fig


graf_3_1 = dbc.Card(
    dbc.CardBody(
        [
            dbc.CardHeader(
                html.H2("Tweet Timeline - Sentiment",
                        className="text-primary text-start card-title-large align-middle")
            ),
            dcc.Graph(id='tweet-timeline-plot-2', figure=build_sentiment_timeline_figure()),
        ]
    ),
    className="shadow my-2 text-center m-1",
    style={'margin': '0 auto'},
    color="primary",
    outline=True,
)


In [35]:
def sentiment_share_within(days, window_total):
    """Sum `sentiment` over tweets from the last `days` days and divide by `window_total`.

    The window starts `days` days before the newest `created_at` value.
    NOTE(review): the result is charted as the "Positive" share, which presumes
    `sentiment` is a 0/1 flag - confirm against the upstream cells.
    """
    cutoff = pd.to_datetime(tweets_fc_pd['created_at'].max()) - timedelta(days=days)
    return tweets_fc_pd[tweets_fc_pd['created_at'] >= cutoff]['sentiment'].sum() / window_total


perc_total = tweets_fc_pd['sentiment'].sum() / total_tweets
perc_pos_7 = sentiment_share_within(7, total_7)
perc_pos_30 = sentiment_share_within(30, total_30)
perc_pos_60 = sentiment_share_within(60, total_60)

# One (period label, positive share) pair per bar; each expands to a Positive and a Negative segment.
period_shares = [
    ('7 days', perc_pos_7),
    ('30 days', perc_pos_30),
    ('60 days', perc_pos_60),
    ('Total', perc_total),
]
data = {
    'period': [period for period, _share in period_shares for _side in ('Positive', 'Negative')],
    'sentiment': [side for _period, _share in period_shares for side in ('Positive', 'Negative')],
    'value': [value for _period, share in period_shares for value in (share, (1 - share))],
}
df = pd.DataFrame(data)
colors = {'Positive': 'blue', 'Negative': 'rgba(128, 128, 128, 0.1)'}


def build_sentiment_share_figure():
    """Build the horizontal stacked bar chart of positive vs. negative share per period.

    Axis ticks, grid and legend are hidden; every bar instead gets two text
    annotations: the period name on the left and the positive percentage on the right.
    """
    fig = px.bar(
        df,
        x='value',
        y='period',
        color='sentiment',
        orientation='h',
        color_discrete_map=colors,
        labels={'value': '', 'period': '', 'sentiment': ''},
    )

    fig.update_layout(
        showlegend=False,
        xaxis_showticklabels=False,
        yaxis_showticklabels=False,
        xaxis_range=[0, 1],
        uniformtext_minsize=8,
        bargap=0.95,
        xaxis=dict(showticklabels=False, showgrid=False),
        yaxis=dict(showticklabels=False, showgrid=False),
    )

    # (bar category, caption, horizontal pixel shift of the caption, positive share)
    bar_captions = [
        ('Total', "Total", -50, perc_total),
        ('7 days', "7 Days", -60, perc_pos_7),
        ('30 days', "30 Days", -65, perc_pos_30),
        ('60 days', "60 Days", -65, perc_pos_60),
    ]
    for period, caption, caption_xshift, share in bar_captions:
        # Period name, shifted up and to the left of the bar.
        fig.add_annotation(
            x=0.2,
            y=period,
            text=caption,
            ax=0,
            ay=0,
            opacity=1,
            xshift=caption_xshift,
            yshift=17,
            arrowcolor="#41b6c4",
            font=dict(size=24),
        )
        # Positive percentage at the right end of the bar.
        fig.add_annotation(
            x=1,
            y=period,
            text=f"{share:,.2%}",
            ax=10,
            ay=-13,
            opacity=1,
            font=dict(size=16),
        )
    return fig


graf_stat = dbc.Card(
    dbc.CardBody(
        [
            dbc.CardHeader(
                html.H2("Statistics",
                        className="text-primary text-center card-title-large align-middle")
            ),
            dcc.Graph(
                id='tweet-sentiment',
                figure=build_sentiment_share_figure(),
                style={'width': '100%', 'height': '100%'},
            ),
        ]
    ),
    className="shadow my-2 text-center m-1",
    style={'margin': '0 auto'},
)

In [77]:
def get_emoji(sentiment, top=None, tweets=None):
    """Count how often each emoji occurs in tweets with the given sentiment.

    Parameters
    ----------
    sentiment
        Value matched against the ``sentiment`` column.
    top : int or None
        Keep only the ``top`` most frequent emojis; ``None`` keeps all of them.
    tweets : pandas.DataFrame or None
        Frame with ``Emojis`` and ``sentiment`` columns. Defaults to the
        notebook-level ``tweets_fc_pd``.

    Returns
    -------
    pandas.DataFrame
        Columns ``Emoji`` and ``Frequency``, most frequent first. Empty (but
        with both columns) when no tweet matches.
    """
    if tweets is None:
        tweets = tweets_fc_pd

    # Rows with an empty "Emojis" value are skipped; every other entry is
    # iterated element by element.
    matching = tweets[(tweets['Emojis'] != "") & (tweets['sentiment'] == sentiment)]['Emojis']
    counts = Counter(symbol for entry in matching for symbol in entry)

    # most_common(None) returns every (emoji, count) pair. Handing the Counter
    # itself to DataFrame (as before) treats its keys as column labels, so the
    # requested 'Emoji'/'Frequency' columns came back without any counts.
    return pd.DataFrame(counts.most_common(top), columns=['Emoji', 'Frequency'])

In [78]:
# Top-20 emoji frequency tables, one per `sentiment` value. `emojis_pos` feeds the
# initial emoji bar chart and the "tab-1" branch of the tab callback; `emojis_neg`
# feeds the other branch.
emojis_pos = get_emoji(sentiment = 1, top = 20)
emojis_neg = get_emoji(sentiment = 0, top = 20)

In [84]:
# Spot-check: display one randomly chosen row of the positive emoji table.
emojis_pos.sample(1)

Unnamed: 0,Emoji,Frequency
0,🚨,4077


In [125]:
# Top-10 tables per sentiment value (1 -> "pos", 0 -> "neg"); the "pos" tables are
# shown initially and the tab callback swaps in the "neg" ones.
user_tables_pos = get_top(col = 'user_mentions', top_of = 'screen_name', top_of_rename = 'Screen Name', sentiment = 1, top_number = 10)
user_tables_neg = get_top(col = 'user_mentions', top_of = 'screen_name', top_of_rename = 'Screen Name', sentiment = 0, top_number = 10)

urls_tables_pos = get_top(col = 'urls', top_of = 'url', top_of_rename = 'URL', sentiment = 1, top_number = 10)
urls_tables_neg = get_top(col = 'urls', top_of = 'url', top_of_rename = 'URL', sentiment = 0, top_number = 10)

# Table of the most mentioned users; its `data` is replaced by the tab callback.
users_table = dash_table.DataTable(
    id='user-table',
    columns=[{'id': c, 'name': c} for c in user_tables_pos.columns],
    page_action='none',
    style_table={'height': '100%', 'overflowY': 'auto'},
    data=user_tables_pos.to_dict('records'),
)

# Table of the most shared URLs; its `data` is replaced by the tab callback.
urls_table = dash_table.DataTable(
    id='urls-table',
    columns=[{'id': c, 'name': c} for c in urls_tables_pos.columns],
    page_action='none',
    style_table={'height': '100%', 'overflowY': 'auto'},
    data=urls_tables_pos.to_dict('records'),
)

# Named `emoji_graph` rather than `emoji` so the imported `emoji` module is not
# overwritten, which would break earlier cells when they are re-run.
emoji_graph = dcc.Graph(
    id='emoji-graph',
    figure=px.bar(emojis_pos, y='Emoji', x='Frequency', orientation='h')
    .update_layout(
        yaxis_tickfont=dict(size=12),
        yaxis=dict(autorange="reversed"),
    ),
)


def titled_card_column(title, content, **column_kwargs):
    """Wrap `content` in a half-width column holding a card with an H4 title header.

    Extra keyword arguments (e.g. ``style``) are forwarded to the `dbc.Col`.
    """
    header = dbc.CardHeader(
        html.H4(title, className="text-primary text-center card-title-large align-middle")
    )
    return dbc.Col(
        dbc.Card(dbc.CardBody([header, content])),
        className="m-1 w-50",
        **column_kwargs,
    )


tabs = dbc.Card(
    [
        # TABS: switch between the positive and negative sentiment views.
        dbc.CardHeader(
            dbc.Tabs(
                [
                    dbc.Tab(label="Positive Sentiment", tab_id="tab-1"),
                    dbc.Tab(label="Negative Sentiment", tab_id="tab-2"),
                ],
                id="card-tabs",
                active_tab="tab-1",
            )
        ),

        # ROW 1: top tagged users and top shared URLs.
        dbc.Row(
            [
                titled_card_column("Top 10 Tagged User", users_table),
                titled_card_column("Top 10 URL Shared", urls_table),
            ],
            className="m-1",
        ),

        # ROW 2: hashtag word cloud image and emoji frequency chart.
        dbc.Row(
            [
                titled_card_column(
                    "Word Cloud Hashtags",
                    dbc.CardImg(id='cloud-hashtags', src='assets/images/hashtags_pos.png',
                                top=True,
                                className="m-1",
                                style={'height': '443px'}),
                ),
                titled_card_column("Emoji - Frequency", emoji_graph, style={'height': '350px'}),
            ],
            className="m-1",
        ),
    ],
    className="shadow my-2 text-center m-1",
    style={'margin': '0 auto'},
)


## App

In [128]:
app = JupyterDash(external_stylesheets=[dbc.themes.MINTY, dbc.icons.BOOTSTRAP])


def centered_row(*columns):
    """Return a `dbc.Row` holding `columns`, styled with ``margin: 0 auto``."""
    return dbc.Row(list(columns), style={'margin': '0 auto'})


app.layout = dbc.Container([
    # TITLE: full-width banner.
    dbc.Row([dbc.Col(title, width=12)]),

    # ROW 1: headline counters.
    centered_row(
        dbc.Col(cards_total_tweets),
        dbc.Col(cards_total_retweets),
        dbc.Col(card_user_mentions),
    ),

    # ROW 2: timeline next to the statistics card.
    centered_row(
        dbc.Col(timeline, width=8),
        dbc.Col(statics, width=4),
    ),

    # ROW 3: sentiment timeline next to the sentiment share chart.
    centered_row(
        dbc.Col(graf_3_1, width=8),
        dbc.Col(graf_stat, width=4),
    ),

    # ROW 4: tabbed positive / negative details.
    centered_row(
        dbc.Col(tabs, width=12),
    ),
])


#CALLBACK-------------------------------------------------------------------------------------------------------------
@app.callback(
    Output('user-table', 'data'),
    Output('urls-table', 'data'),
    Output('cloud-hashtags', 'src'),
    Output('emoji-graph', 'figure'),
    Input('card-tabs', 'active_tab')
)
def update_tables(active_tab):
    """Refresh the tab contents when the active sentiment tab changes.

    'tab-1' selects the positive tables, image and emoji counts; any other
    value selects the negative ones.

    Returns the user table records, the URL table records, the hashtag image
    path and the emoji bar figure, in the order of the callback outputs.
    """
    if active_tab == 'tab-1':
        users, urls, emojis = user_tables_pos, urls_tables_pos, emojis_pos
        src_hashtag = 'assets/images/hashtags_pos.png'
    else:
        users, urls, emojis = user_tables_neg, urls_tables_neg, emojis_neg
        src_hashtag = 'assets/images/hashtags_neg.png'

    user_table_data = users.to_dict('records')
    urls_table_data = urls.to_dict('records')

    # Reversed y axis so the first row of the frame is drawn at the top.
    fig_emoji = px.bar(emojis, y='Emoji', x='Frequency', orientation='h')
    fig_emoji.update_layout(yaxis_tickfont=dict(size=12), yaxis=dict(autorange="reversed"))

    return user_table_data, urls_table_data, src_hashtag, fig_emoji
            
# Start the Dash server (debug mode on) only when this code runs as the main module.
if __name__ == "__main__":
    app.run_server(debug=True)#mode='inline')

Dash is running on http://127.0.0.1:8050/

Dash app running on http://127.0.0.1:8050/
