In [2]:
import pandas as pd
import numpy as np
import plotly.graph_objects as go
import plotly.express as px

import seaborn as sns

import dash
import dash_core_components as dcc
import dash_html_components as html
from dash.dependencies import Input, Output

TOP_K = 15
ASSETS_PATH="assets"
# pd.set_option('display.max_rows', None)
# pd.set_option('display.max_columns', None)
# pd.set_option('display.width', None)
# pd.set_option('display.max_colwidth', -1)

In [3]:
dataset = pd.read_csv("./data/Emails.csv")
df = pd.DataFrame(dataset)

df.rename(columns={"MetadataDateSent": "DateSent"}, inplace=True)
df = pd.DataFrame(df[["SenderPersonId","DateSent"]])
df.dropna(inplace = True)

df["DateSent"] = df["DateSent"].astype(str)
df["DateSent"]= df["DateSent"].str.slice(0, 10, 1)
df["DateSent"] = pd.to_datetime(df["DateSent"])

df["SenderPersonId"] = df["SenderPersonId"].astype(str)
df["SenderPersonId"] = df["SenderPersonId"].str.slice(0, -2, 1)

print("First email was on:", df.DateSent.min())
print("Last email was on:", df.DateSent.max())
print("The dataset covers", df.DateSent.max() - df.DateSent.min(), "days")
print("The most active day was",df["DateSent"].value_counts().index[0],
      "with",df["DateSent"].value_counts()[0],"emails")

First email was on: 2008-05-01 00:00:00
Last email was on: 2014-12-14 00:00:00
The dataset covers 2418 days 00:00:00 days
The most active day was 2009-12-23 00:00:00 with 47 emails


In [4]:
# Assegnare ID in persons.csv a dataframe principale, così da poter tradurre gli IDs in nomi
persons = pd.read_csv("./data/Persons.csv")
persons = persons.to_dict()
persons = persons["Name"]
df["SenderFullName"] = None

i = 0
for cell,i in zip(df["SenderPersonId"].items(),range(0,len(df))):
    for key in persons.keys():
        if cell[1] == str(key):
            df["SenderFullName"].iloc[i] = persons[key-1]
            i += 1

A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  self._setitem_single_block(indexer, value, name)


In [5]:
i = 0
x = 0
num = 490
while i < 10:
    for date in df['DateSent'].value_counts(normalize=True)[:num]:
        x += df['DateSent'].value_counts(normalize=True)[i]
        i += 1
print(f"In {num} giorni, cioè il {round(num/int(df['DateSent'].count()),2)*100}% del totale dei "
      f"giorni, si è concentrato il {round(x,2)*100}% degli scambi di mail")

In 490 giorni, cioè il 6.0% del totale dei giorni, si è concentrato il 90.0% degli scambi di mail


In [29]:
top_people = df['SenderFullName'].value_counts(normalize=True)[:TOP_K]
cum_sum = np.cumsum(top_people)

chart2 = go.Figure()

chart2.add_trace(go.Bar(
    x=cum_sum.index[:TOP_K],
    y=top_people[:TOP_K],
    name="Markers and Text",
    marker_color="#636EFA",
    showlegend=False
))

chart2.add_trace(go.Scatter(
    x=cum_sum.index[:TOP_K],
    y=cum_sum,
    mode="text+lines+markers",
    #name = "Cumulata",
    text=[i for i in range(1, TOP_K + 1)],
    textposition="top center",
    showlegend=False
))

chart2.update_xaxes(
    tickangle = 45
)

chart2.update_yaxes(
    title_text = "Probabilità dell'attività",
)

chart2.update_layout(
    title=dict(
        text=f'Attività dei contatti cumulata',
        xanchor='center',
        x=0.5, y=.85),
    yaxis = dict(
        tickmode = 'linear',
        tick0 = 0,
        dtick = 0.1,
    )
)
chart2.write_image("assets/cumulative_activity.png", scale=4)
chart2.show()

In [23]:
chart3 = px.treemap(df, path=['SenderFullName'],
                    height = 500,
                    title = "Origine delle mail in percentuale")

chart3.update_layout(
    title=dict(
        x=0.5,
        y=0.8,
        xanchor='center')
)

chart3.update_traces(textinfo = 'label + percent parent')
chart3.write_image("assets/activity.png", scale=4)
chart3.show()

In [7]:
# Trovare i 5 giorni più intensi per le prime 10 persone più attive
names_group = df.groupby(["SenderFullName"])
# for name in su1:
#     print(name)
#     print(names_group["DateSent"].value_counts().loc[name][:5],"\n")
    
# p = df.groupby('SenderFullName').DateSent.value_counts(normalize=True)

In [8]:
n_of_emails = len(df)
n_of_people = len(df["SenderFullName"].unique())

def generate_hisplot_expl(dataframe):
    i = 0
    x = 0
    while i < len(dataframe.value_counts(normalize=False)):
        for line in dataframe.value_counts(normalize=False):
            x += dataframe.value_counts(normalize=False)[i]
            i += 1
    perc_p = round(len(dataframe)/n_of_people,4)*100
    perc_e = round(x/n_of_emails,4)*100
    
    return html.Div([
        dcc.Graph(
            id='displot-people-date',
            figure = px.histogram(dataframe, x="DateSent", marginal="rug", color="SenderFullName")),
        html.Center(html.H4(
                   children = [f"I {len(dataframe.SenderFullName.unique())} contatti scelti, cioè il "+
                               f"{round(len(dataframe.SenderFullName.unique())/n_of_people*100,2)}% del totale,"+
      f" sono responsabili per il {round(x/n_of_emails*100,2)}% del flusso totale di mail"]
        )
    )
    ])
# https://wallpapercave.com/wp/wp2038942.jpg
# https://www.wallpapers4u.org/wp-content/uploads/light_blue_rays_shine_9824_1920x1080.jpg

external_stylesheets = ['https://codepen.io/chriddyp/pen/bWLwgP.css']

app = dash.Dash(__name__, external_stylesheets=external_stylesheets)
app.layout = html.Div(
    children=[
        html.Div(
            style={'background-image': "url(https://static.vecteezy.com/system/resources/previews/000/072/938/original/rays-background-vector.jpg)", 
                   'background-repeat' : "no-repeat",
                   'background-size' : 'cover',
                   'margin-right' : "auto",
                   'margin-left' : 'auto',
                   'opacity' : 1,
                   'float' : 'none'
                    }, 
            children=[
                html.Center(
                    html.H1(html.I("Hillary Clinton's Leaked Emails: un'analisi"))
                )
            ]
        ),

    dcc.Tabs([
        
        
        dcc.Tab(label='Analisi Esplorativa', children=[
            html.Center(html.H2(children='Attività dei contatti')),
            html.Br(),
            html.Center(html.H4(children='Nei grafici sottostanti sono esposte le attività dei contatti presenti '+ 
                'nel dataset. Come è possibile notare, le attività cominciano il 2008-05-01 e finiscono '+
                'il 2014-12-14, per una copertura totale di 2418 giorni, ovvero più di 6 anni.')),
            dcc.Dropdown(
                id='dropdown',
                options=[{'label': i, 'value': i} for i in top_people.index],
                multi=True, placeholder='Selezionare i contatti desiderati... (Ordinati per utenti più attivi)'),
            html.Div(id='plot-container'),
            html.Br(),
            html.Div(
                children=[
                    dcc.Graph(figure=chart3),
                    dcc.Graph(figure=chart2)
            ]
        
        )]),
        
        
        dcc.Tab(label='Sentiment Analysis', children=[
            dcc.Graph(
                figure={
                    'data': [
                        {'x': [1, 2, 3], 'y': [1, 4, 1],
                            'type': 'bar', 'name': 'SF'},
                        {'x': [1, 2, 3], 'y': [1, 2, 3],
                         'type': 'bar', 'name': u'Montréal'},
                    ]
                }
            )
        ]),
        
        
        ])])

@app.callback(dash.dependencies.Output('plot-container', 'children'),
    [dash.dependencies.Input('dropdown', 'value')])

def display_hisplot_expl(dropdown_value):
    if dropdown_value is None:
        return generate_hisplot_expl(df)
    dff = df[df.SenderFullName.str.contains('|'.join(dropdown_value))]
    return generate_hisplot_expl(dff)

if __name__ == '__main__':
    app.run_server(debug=False, port=1346)

Dash is running on http://127.0.0.1:1346/

 * Serving Flask app "__main__" (lazy loading)
 * Environment: production
[2m   Use a production WSGI server instead.[0m
 * Debug mode: off


 * Running on http://127.0.0.1:1346/ (Press CTRL+C to quit)
127.0.0.1 - - [19/Jun/2021 17:25:45] "[37mGET / HTTP/1.1[0m" 200 -
127.0.0.1 - - [19/Jun/2021 17:25:45] "[37mGET /_dash-component-suites/dash_renderer/polyfill@7.v1_9_1m1617900226.8.7.min.js HTTP/1.1[0m" 200 -
127.0.0.1 - - [19/Jun/2021 17:25:45] "[37mGET /_dash-component-suites/dash_renderer/react@16.v1_9_1m1617900226.14.0.min.js HTTP/1.1[0m" 200 -
127.0.0.1 - - [19/Jun/2021 17:25:45] "[37mGET /_dash-component-suites/dash_renderer/prop-types@15.v1_9_1m1617900226.7.2.min.js HTTP/1.1[0m" 200 -
127.0.0.1 - - [19/Jun/2021 17:25:45] "[37mGET /_dash-component-suites/dash_core_components/dash_core_components-shared.v1_16_0m1617909890.js HTTP/1.1[0m" 200 -
127.0.0.1 - - [19/Jun/2021 17:25:45] "[37mGET /_dash-component-suites/dash_renderer/react-dom@16.v1_9_1m1617900226.14.0.min.js HTTP/1.1[0m" 200 -
127.0.0.1 - - [19/Jun/2021 17:25:45] "[37mGET /_dash-component-suites/dash_html_components/dash_html_components.v1_1_3m1617