In [1]:
#Spark
from pyspark.sql import SparkSession
from pyspark.sql.window import Window
from pyspark import SparkConf

#Hadoop
from hdfs import InsecureClient

#Spark SQL functions
from pyspark.sql.functions import *
from pyspark.sql.functions import from_utc_timestamp, udf, array_distinct, col, when
from pyspark.sql.functions import regexp_replace, year, month, dayofmonth, hour, format_string
from pyspark.sql.functions import monotonically_increasing_id

# Spark Datatypes
from pyspark.sql.types import StringType, TimestampType, DateType, IntegerType
from pyspark.sql.types import DoubleType, StructType, FloatType, StructField


#Pandas
import pandas as pd


## Warnings conf

In this section, warnings are suppressed to reduce log noise while the code runs.

In [2]:
import warnings

# Suppressing the warnings
warnings.filterwarnings('ignore') 


# Data Loading

## Reading Data from Hadoop

In this section we are reading data from Hadoop, using a Spark Session.

### Spark configurations and Connection

In [3]:
def spark_start(restart = False, appname="HadoopAccess"):
    """Return a local SparkSession configured for this notebook.

    Parameters
    ----------
    restart : bool, default False
        When true, stop the currently active SparkSession (if there is one)
        before building the session, so that a fresh one is created.
    appname : str, default "HadoopAccess"
        Application name passed to ``SparkConf.setAppName``.

    Returns
    -------
    SparkSession
        Session built with master ``local[*]`` and the SQL session time zone
        set to ``UTC``.
    """
    if restart:
        # Look the running session up explicitly. `spark` is assigned further
        # down in this function, which makes it a local name, so it cannot be
        # used here to reach a session created by an earlier call.
        active_session = SparkSession.getActiveSession()
        if active_session is not None:
            active_session.stop()

    # Configuration parameters for Spark
    spark_conf = SparkConf().setMaster("local[*]").setAppName(appname)

    # Using SparkSession
    spark = SparkSession.builder.config(conf=spark_conf).config('spark.sql.session.timeZone', 'UTC').getOrCreate()

    return spark


# Create (or reuse) the notebook-wide Spark session with the default settings.
spark = spark_start()

# Restrict Spark's logging to ERROR level so cell outputs are not flooded with
# lower-severity log lines.
sc = spark.sparkContext
sc.setLogLevel('ERROR')


2023-05-26 11:24:10,662 WARN util.Utils: Your hostname, BDS-2023 resolves to a loopback address: 127.0.1.1; using 192.168.0.122 instead (on interface wlo1)
2023-05-26 11:24:10,663 WARN util.Utils: Set SPARK_LOCAL_IP if you need to bind to another address
Setting default log level to "WARN".
To adjust logging level use sc.setLogLevel(newLevel). For SparkR, use setLogLevel(newLevel).
2023-05-26 11:24:11,409 WARN util.NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable


# Creating Models in Spark

In [4]:
# Read every parquet file matched by the glob under /CA4/sentiment
# (presumably on HDFS, per the section text above - confirm against the cluster config).
tweets_pred_saved = spark.read.parquet("/CA4/sentiment/**/*.parquet")

                                                                                

## Binary Sentiment

### creating a binary sentiment

The binary sentiment is derived from the score: tweets with a score greater than 0 are labelled positive (1), and all other tweets are labelled negative (0).

In [5]:
# Derive the binary sentiment label from the score: '1' when score > 0, '0' otherwise.
# The label is stored as a string, and rows that do not satisfy score > 0
# (including a score of exactly 0) fall into the '0' branch.
tweets_fc_df = tweets_pred_saved.withColumn('sentiment',when(col("score") > 0, '1').otherwise('0'))

In [6]:
# Print the column names and types after adding the sentiment label.
tweets_fc_df.printSchema()

root
 |-- created_at: timestamp (nullable = true)
 |-- text: string (nullable = true)
 |-- cleaned_text: string (nullable = true)
 |-- entities: string (nullable = true)
 |-- prediction: double (nullable = true)
 |-- textblob: string (nullable = true)
 |-- vader: string (nullable = true)
 |-- score: double (nullable = true)
 |-- sentiment: string (nullable = false)



In [7]:
# Flag retweets: RT is 1 when the raw tweet text starts with "RT", otherwise 0.
tweets_fc = tweets_fc_df.withColumn("RT", when(col("text").startswith("RT"), 1).otherwise(0))

In [8]:
from pyspark.sql.functions import year, month, dayofmonth, hour, col, mean, count, to_date
from pyspark.sql.functions import sum as spark_sum

# Switch this session's time parser policy to LEGACY.
spark.conf.set("spark.sql.legacy.timeParserPolicy","LEGACY")

# One row per (calendar hour, sentiment): mean score, number of scored tweets
# and number of retweets, sorted chronologically.
tweets_hour = (
    tweets_fc
    .groupBy(
        year("created_at").alias("year"),
        month("created_at").alias("month"),
        dayofmonth("created_at").alias("day"),
        to_date("created_at").alias("date"),
        hour("created_at").alias("hour"),
        "sentiment",
    )
    .agg(
        mean("score").alias("hourly_score"),
        count("score").alias("count_score"),
        spark_sum("RT").alias("sum_RT"),
    )
    .orderBy("year", "month", "day", "hour")
)

tweets_hour.printSchema()
tweets_hour.show(2)

root
 |-- year: integer (nullable = true)
 |-- month: integer (nullable = true)
 |-- day: integer (nullable = true)
 |-- date: date (nullable = true)
 |-- hour: integer (nullable = true)
 |-- sentiment: string (nullable = false)
 |-- hourly_score: double (nullable = true)
 |-- count_score: long (nullable = false)
 |-- sum_RT: long (nullable = true)



[Stage 2:>                                                          (0 + 8) / 8]

+----+-----+---+----------+----+---------+--------------------+-----------+------+
|year|month|day|      date|hour|sentiment|        hourly_score|count_score|sum_RT|
+----+-----+---+----------+----+---------+--------------------+-----------+------+
|2021|   11|  1|2021-11-01|   0|        0|-0.11364917380729236|         97|    67|
|2021|   11|  1|2021-11-01|   0|        1|   0.216212583461736|         59|    46|
+----+-----+---+----------+----+---------+--------------------+-----------+------+
only showing top 2 rows



                                                                                

In [9]:
# Number of tweet rows in the enriched dataset.
total_tweets = tweets_fc.count()
print(f"Total of Tweets:{total_tweets:,}")

# Number of distinct calendar dates on which at least one tweet was created.
days = (
    tweets_fc
    .select(to_date(col("created_at")).alias("tweet_date"))
    .agg(countDistinct("tweet_date").alias("n_days"))
    .first()["n_days"]
)

# Average daily volume over the dates that actually appear in the data.
mean_tweets = total_tweets / days
print(f"Average of {mean_tweets:,.0f} tweets per day ({days} total)")


Total of Tweets:763,266
Average of 2,091 tweets per day (365 total)


In [10]:
# Collect the hourly aggregate into a pandas DataFrame for the pandas / Plotly steps that follow.
tweets_hour_pd = tweets_hour.toPandas()

                                                                                

In [11]:
# Work on a copy so the collected hourly aggregate stays untouched.
dataset_scaled_EDA = tweets_hour_pd.copy()

# Smallest and largest tweet count observed for each (hour, sentiment) pair,
# broadcast back onto every row of that pair.
dataset_scaled_EDA['min_hour'] = dataset_scaled_EDA.groupby(['hour','sentiment'])['count_score'].transform('min')
dataset_scaled_EDA['max_hour'] = dataset_scaled_EDA.groupby(['hour','sentiment'])['count_score'].transform('max')

# Min-max scale the count within its (hour, sentiment) pair.
# A pair whose min equals its max has a zero range, so its scaled value is NaN.
dataset_scaled_EDA['hour_scaled'] = (
    (dataset_scaled_EDA['count_score'] - dataset_scaled_EDA['min_hour'])
    / (dataset_scaled_EDA['max_hour'] - dataset_scaled_EDA['min_hour'])
)

# Add the day of the week as an ordered categorical (Monday first).
# day_name() always returns English names, so the values match the categories
# below regardless of the machine's locale, and missing dates become NaN
# instead of raising.
dataset_scaled_EDA['day_of_week'] = pd.Categorical(
    pd.to_datetime(dataset_scaled_EDA['date']).dt.day_name(),
    categories=['Monday', 'Tuesday', 'Wednesday', 'Thursday', 'Friday', 'Saturday', 'Sunday'],
    ordered=True)

# Replace the '1' / '0' sentiment codes with readable labels.
dataset_scaled_EDA['sentiment'] = dataset_scaled_EDA['sentiment'].map(
    lambda code: 'Positive' if code == '1' else 'Negative')
dataset_scaled_EDA.head(10)

Unnamed: 0,year,month,day,date,hour,sentiment,hourly_score,count_score,sum_RT,min_hour,max_hour,hour_scaled,day_of_week
0,2021,11,1,2021-11-01,0,Positive,0.216213,59,46,1,164,0.355828,Monday
1,2021,11,1,2021-11-01,0,Negative,-0.113649,97,67,3,226,0.421525,Monday
2,2021,11,1,2021-11-01,1,Positive,0.16842,69,48,5,173,0.380952,Monday
3,2021,11,1,2021-11-01,1,Negative,-0.122941,86,62,5,238,0.347639,Monday
4,2021,11,1,2021-11-01,2,Negative,-0.095417,80,56,2,175,0.450867,Monday
5,2021,11,1,2021-11-01,2,Positive,0.19856,65,48,4,155,0.403974,Monday
6,2021,11,1,2021-11-01,3,Negative,-0.112407,73,48,6,216,0.319048,Monday
7,2021,11,1,2021-11-01,3,Positive,0.223755,60,44,4,137,0.421053,Monday
8,2021,11,1,2021-11-01,4,Positive,0.214048,46,30,2,136,0.328358,Monday
9,2021,11,1,2021-11-01,4,Negative,-0.147583,59,45,4,171,0.329341,Monday


In [12]:
# Separate copy for the dashboard, so columns added to it later do not touch dataset_scaled_EDA.
df = dataset_scaled_EDA.copy()

In [55]:
import pandas as pd
import dash
from jupyter_dash import JupyterDash
import dash_bootstrap_components as dbc
import dash_core_components as dcc
import dash_html_components as html
from dash_bootstrap_templates import load_figure_template

import numpy as np
import plotly.graph_objs as go
import plotly.express as px
from dash.dependencies import Input,Output
from dash import callback_context


In [56]:
# Load the 'sketchy' figure template from dash_bootstrap_templates
# (presumably so Plotly figures match the SKETCHY Bootstrap theme used by the app - confirm).
load_figure_template('sketchy')

In [84]:
# Marker size for the sentiment scatter plot: the absolute value of the mean
# hourly score multiplied by a constant factor. The column is added to `df` in place.
size_scale = 10
df['ball_size'] = df['hourly_score'].abs() * size_scale

In [57]:
# Scatter of hourly tweet counts by date, coloured by sentiment. It is built from
# tweets_hour_pd, so sentiment still holds the '0' / '1' codes here.
# NOTE(review): fig1 is not referenced by the layout visible in this notebook section.
fig1 = px.scatter(tweets_hour_pd, x= 'date', y = 'count_score', color = 'sentiment')

# Inline CSS that pins the sidebar to the left edge for the full page height.
SIDEBAR_STYLE = {
    "position": "fixed",
    "top": 0,
    "left": 0,
    "bottom": 0,
    "width": "24rem",
    "padding": "2rem 1rem",
    "background-color": "#f8f9fa",
}


# Sidebar with a heading, a short description and one dropdown (id 'one')
# that is created without any options.
# NOTE(review): sidebar is not referenced by the layout visible in this notebook section.
sidebar = html.Div(
    [
        html.H2("Filters"),
        html.Hr(),
        html.P(
            "A simple sidebar layout with filters", className="lead"
        ),
        dbc.Nav(
            [
                dcc.Dropdown(id = 'one'),
                html.Br(),

            ],
            vertical=True,
            pills=True,
        ),
    ],
    style=SIDEBAR_STYLE,

)

In [58]:
def Plot_box(df, y, x, color, title, xaxis_title_text, yaxis_title_text, legend_title_text):
    """Build a notched, vertical Plotly Express box plot with the notebook's styling.

    Parameters
    ----------
    df : data passed to ``px.box`` as its first argument.
    y, x : columns used for the value axis and the category axis.
    color : column used to colour (and split) the boxes.
    title : figure title.
    xaxis_title_text, yaxis_title_text : axis titles.
    legend_title_text : legend title.

    Returns
    -------
    The figure returned by ``px.box`` after the layout and x-axis updates.
    Width and height are not set here.
    """
    axis_grey = 'grey'
    faint_green = '#F7FCF0'

    layout_options = {
        'showlegend': True,
        'title_text': title,
        'title_font_color': '#333333',
        'plot_bgcolor': 'white',
        'xaxis_title_font_color': axis_grey,
        'yaxis_title_font_color': axis_grey,
        'yaxis_color': axis_grey,
        'xaxis_color': axis_grey,
        'xaxis_title_text': xaxis_title_text,
        'yaxis_title_text': yaxis_title_text,
        'xaxis_gridcolor': faint_green,
        'xaxis_linecolor': faint_green,
        'coloraxis_showscale': False,
        'legend_title_text': legend_title_text,
    }

    box_figure = px.box(df, y=y, x=x, orientation='v', color=color, notched=True)
    box_figure.update_layout(**layout_options)

    # Order the categories on the x axis by their total, largest first.
    box_figure.update_xaxes(categoryorder='total descending')

    return box_figure


def generate_table(dataframe, max_rows=10):
    """Render the first ``max_rows`` rows of a DataFrame as a Dash ``html.Table``.

    Parameters
    ----------
    dataframe : pandas.DataFrame
        Data to display.
    max_rows : int, default 10
        Maximum number of rows (taken from the top) to include.

    Returns
    -------
    html.Table
        Table whose children are a ``Thead`` with one ``Th`` per column and a
        ``Tbody`` with one ``Tr`` per row and one ``Td`` per cell.
    """
    limited_dataframe = dataframe.head(max_rows)

    header = html.Thead(
        html.Tr([html.Th(column) for column in limited_dataframe.columns])
    )

    # Iterate with itertuples so every cell is read from its own column.
    # Going through a row Series (`iloc[i][column]`) forces all values of the
    # row into one dtype, which turns integers into floats in numeric frames,
    # and rebuilds that row once per cell.
    body = html.Tbody([
        html.Tr([html.Td(value) for value in row])
        for row in limited_dataframe.itertuples(index=False, name=None)
    ])

    return html.Table([header, body])

In [59]:
# Header card of the dashboard: logo image in a 4-column cell on the left and
# the title text ("Twitter Analytics for" / "Vaccine") in an 8-column cell on the right.
card_inicial = dbc.Card(
    [
        dbc.Row(
            [
                 dbc.Col(
                    dbc.CardImg(
                        src="https://www.cct.ie/wp-content/uploads/CCT_Logo_New_Aug_17-2.jpg",
                        className="img-fluid rounded-start",
                    ),
                    className="col-md-4",
                ),
                
                dbc.Col(
                    dbc.CardBody(
                        [
                            html.P("Twitter Analytics for", className="card-title mx-auto"),
                            html.H1("Vaccine", className="text-primary mx-auto"),
                        ]
                    ),
                    className="col-md-8 mx-auto",
                ),
            ],
            className="g-0 d-flex align-items-center",
        )
    ],
    className="mb-3",
    # Card takes 95% of the available width and is centred horizontally.
    style={'width': '95%', 'margin': '0 auto'},
)


In [60]:
# Summary cards built from total_tweets, days and mean_tweets.

def _metric_card(label, value):
    """Return a full-width card showing a caption with a large value underneath."""
    return dbc.Card(
        dbc.CardBody(
            [
                html.P(label, className="card-text text-center align-middle"),
                html.H1(value, className="text-primary text-center card-title-large align-middle"),
            ]
        ),
        style={"width": "100%"},
    )


cards_total_tweets = _metric_card('Total of Tweets', '{:,.0f}'.format(total_tweets))
cards_days = _metric_card('Total of Days', days)
cards_mean_tweets = _metric_card('Tweets per Day', '{:,.2f}'.format(mean_tweets))

# One row with the three cards, each wrapped in an outlined primary card
# inside its own column.
row_1 = dbc.Row(
    [
        dbc.Col(dbc.Card(metric_card, color="primary", outline=True))
        for metric_card in (cards_total_tweets, cards_days, cards_mean_tweets)
    ],
    className="mb-4",
    justify="evenly",
    style={'width': '97%', 'margin': '0 auto'},
)


In [79]:
# "Tweet Timeline" card: line chart of the tweet counts in `df` against the date.
# `df` holds one row per (date, hour, sentiment), so several points share each date.
row_2 = dbc.Card(
    dbc.CardBody(
        [
            dbc.CardHeader([
            html.H2("Tweet Timeline", className="text-primary text-start card-title-large align-middle"),
            ]),
            dcc.Graph(
                id='tweet-timeline-plot',
                figure=px.line(df, 
                               x='date', 
                               y='count_score',
                               # Keys must name the plotted columns for the
                               # axis titles to be applied: the y axis shows
                               # `count_score`, so that is the key to relabel.
                               labels={"date": "Date", 
                                       "count_score": "Count"},
                              )
            )
        ]
     ),
    className="mb-4",
    # Card takes 95% of the available width and is centred horizontally.
    style={'width': '95%', 'margin': '0 auto'},
    color="primary", 
    outline=True ,
)

In [94]:
# "Tweet Timeline - Sentiment" card: bubble chart of the tweet counts in `df`
# against the date, coloured by sentiment and sized by the `ball_size` column
# (which must already have been added to `df`).
graf_3_1 = dbc.Card(
    dbc.CardBody(
        [
            dbc.CardHeader(
                [
                    html.H2("Tweet Timeline - Sentiment", className="text-primary text-start card-title-large align-middle")
                ]
            ),
            dcc.Graph(
                id='tweet-timeline-plot-2',
                figure=px.scatter(
                    df,
                    x="date",
                    y="count_score",
                    size="ball_size",
                    color="sentiment",
                    size_max=60,
                    # NOTE(review): the y axis plots `count_score` (the raw count) but is
                    # labelled "Count Scaled"; either the label or the column
                    # (`hour_scaled`?) looks unintended - confirm which one is wanted.
                    labels={"date": "Date", "count_score": "Count Scaled"}
                )
            )
        ]
    ),
    className="mb-4",
    # Card takes 95% of the available width and is centred horizontally.
    style={'width': '95%', 'margin': '0 auto'},
    color="primary",
    outline=True
)


In [96]:
# NOTE(review): these imports rebind `html`, `dcc`, `Input` and `Output`, which an
# earlier cell imported from dash_html_components / dash_core_components /
# dash.dependencies. The components above were built with the earlier bindings.
from dash import Dash, html, dcc, Input, Output
import pandas as pd
import plotly.express as px

# Dash application styled with the SKETCHY Bootstrap theme and the Bootstrap icon set.
app = JupyterDash(external_stylesheets=[dbc.themes.SKETCHY, dbc.icons.BOOTSTRAP], )

    
# Page layout: header card, summary cards, timeline chart and sentiment chart,
# stacked top to bottom inside a single container.
app.layout = html.Div([
 
    html.Div([
        
        #Card 
        card_inicial,
        
        #Row_1
        row_1,        
        
        row_2,
        
        graf_3_1,

    ]),

    

])
# Start the Dash server with its default settings; the inline display mode is left disabled.
if __name__ == "__main__":
    app.run_server()#mode='inline')

Dash is running on http://127.0.0.1:8050/

Dash app running on http://127.0.0.1:8050/


### 