## Import

In [None]:
import math
import json

import numpy as np
import pandas as pd
import seaborn as sb

from collections import Counter
from datetime import datetime

from plotly.subplots import make_subplots
import plotly.graph_objects as go
import matplotlib.pyplot as plt
import plotly.express as px

## Dataset preparation

In [None]:
# Load the users dataset from the CSV file located one directory up.
# low_memory=False: parse the file in one pass instead of in chunks.
# lineterminator='\n': only a newline character ends a record.
df = pd.read_csv(
    '../users.csv',
    lineterminator='\n',
    low_memory=False,
)

In [None]:
# Convert the textual 'created_at' column into timezone-aware timestamps.
# The time of day is spelled out as %H:%M:%S rather than %X: %X means "the
# locale's time representation", so the parse would depend on the LC_TIME
# setting of the process running the notebook.
# NOTE(review): assumes the raw values carry a 24-hour HH:MM:SS time (what %X
# matches under the default C locale) - confirm against the data.
df['created_at'] = pd.to_datetime(
    df['created_at'],
    format="%a %b %d %H:%M:%S %z %Y",
)

## Initial dataset study

In [None]:
# Print a summary of the dataframe (columns, non-null counts, dtypes).
df.info()
# Bare expression left as the last statement of the cell so that the notebook
# renders the dataframe itself below the summary.
df

In [None]:
# Print a heading, then compute the aggregate statistics of the features.
print('Aggregated statistics on features')
# Last expression of the cell: its result is what the notebook displays.
df.describe()

## Boolean info visualization

In [None]:
# Pie traces can only be placed in 'domain'-type subplot cells, so every cell
# of the 2x2 grid is declared as such. The same layout is used for all figures.
pie_specs = [[{'type': 'domain'}, {'type': 'domain'}],
             [{'type': 'domain'}, {'type': 'domain'}]]
names = ['Yes', 'No']


def _yes_no_pie(is_yes, title):
    """Build a Yes/No pie trace from a boolean Series.

    is_yes: Series that is True on the rows to count as 'Yes'; every other
        row is counted as 'No'.
    title: text displayed with the pie.
    """
    yes_count = is_yes.sum()
    return go.Pie(labels=names, values=[yes_count, len(is_yes) - yes_count],
                  title=title, textposition='inside')


# --- Figure 1: account flags -------------------------------------------------
# NOTE(review): assumes these four columns hold booleans - confirm.
fig = make_subplots(rows=2, cols=2, specs=pie_specs)

#protected
fig.add_trace(_yes_no_pie(df['protected'], "Is the user protected?"), row=1, col=1)

#geo_enabled
fig.add_trace(_yes_no_pie(df['geo_enabled'], "Is geolocation enabled?"), row=1, col=2)

#verified
fig.add_trace(_yes_no_pie(df['verified'], "Is user verified?"), row=2, col=1)

#contributors_enabled
fig.add_trace(_yes_no_pie(df['contributors_enabled'], "Is contributors enabled?"), row=2, col=2)

fig.update_layout(title="Boolean info visualization")
fig.show()

# --- Figure 2: profile customisation -----------------------------------------
fig = make_subplots(rows=2, cols=2, specs=pie_specs)

#default_profile
fig.add_trace(_yes_no_pie(df['default_profile'], "Is a default profile?"), row=1, col=1)

#default_profile_image
fig.add_trace(_yes_no_pie(df['default_profile_image'],
                          "Is the user using the default image profile?"), row=1, col=2)

#description
# "Yes" must count the users that HAVE a description, i.e. the non-missing
# values (counting the missing ones under "Yes" inverts the chart).
fig.add_trace(_yes_no_pie(df['description'].notna(), "Has the user a description?"),
              row=2, col=1)

#url
# Same rule as for the description: "Yes" = the url is present.
fig.add_trace(_yes_no_pie(df['url'].notna(), "Has the user an url?"), row=2, col=2)

fig.show()

# --- Figure 3: error codes ---------------------------------------------------
fig = make_subplots(rows=2, cols=2, specs=pie_specs)

#Error code
# The slice labels are derived from the very codes used to filter the data, so
# a label can never disagree with the value it represents.
error_codes = [63, 50]
code_labels = [str(code) for code in error_codes]
code_counts = [(df['code'] == code).sum() for code in error_codes]
missing_count = df['code'].isna().sum()

values = [missing_count] + code_counts
fig.add_trace(go.Pie(labels=['Nan'] + code_labels, values=values, title="Error codes",
                     textposition='inside'), row=1, col=1)

# Same distribution restricted to the rows that actually have a code.
values = code_counts
fig.add_trace(go.Pie(labels=code_labels, values=values, title="Error codes without nan",
                     textposition='inside'), row=1, col=2)

fig.show()

## List of the accounts with the most interactions

### Most followers

In [None]:
# Number of accounts shown in the ranking.
n = 20

# Total the numeric columns per account name.
# numeric_only=True keeps the aggregation to columns that can be summed:
# 'created_at' is a datetime column, which does not support sum.
# NOTE(review): rows sharing the same 'name' are merged into a single bar -
# confirm that this is intended for a "Username" ranking.
dfGroupUsername = df.groupby(by=['name']).sum(numeric_only=True)
dfGroupUsername.sort_values(by=['followers_count'], ascending=False, inplace=True)

# Take the top slice once and reuse it for both the data and the axis.
topUsers = dfGroupUsername.head(n)

fig = px.histogram(topUsers, y=topUsers.index, x='followers_count', orientation='h')
fig.update_layout(title="The %d users with most follower" % n)
fig.update_yaxes(title="Username")

fig.show()

### Most friends

In [None]:
# Number of accounts shown in the ranking.
n = 20

# Total the numeric columns per account name.
# numeric_only=True keeps the aggregation to columns that can be summed:
# 'created_at' is a datetime column, which does not support sum.
# NOTE(review): rows sharing the same 'name' are merged into a single bar -
# confirm that this is intended for a "Username" ranking.
dfGroupUsername = df.groupby(by=['name']).sum(numeric_only=True)
dfGroupUsername.sort_values(by=['friends_count'], ascending=False, inplace=True)

# Take the top slice once and reuse it for both the data and the axis.
topUsers = dfGroupUsername.head(n)

fig = px.histogram(topUsers, y=topUsers.index, x='friends_count', orientation='h')
fig.update_layout(title="The %d users with most friends" % n)
fig.update_yaxes(title="Username")

fig.show()

### Most statuses

In [None]:
# Number of accounts shown in the ranking.
n = 20

# Total the numeric columns per account name.
# numeric_only=True keeps the aggregation to columns that can be summed:
# 'created_at' is a datetime column, which does not support sum.
# NOTE(review): rows sharing the same 'name' are merged into a single bar -
# confirm that this is intended for a "Username" ranking.
dfGroupUsername = df.groupby(by=['name']).sum(numeric_only=True)
dfGroupUsername.sort_values(by=['statuses_count'], ascending=False, inplace=True)

# Take the top slice once and reuse it for both the data and the axis.
topUsers = dfGroupUsername.head(n)

fig = px.histogram(topUsers, y=topUsers.index, x='statuses_count', orientation='h')
# The plotted column is 'statuses_count', so the title says "statuses".
fig.update_layout(title="The %d users with most statuses" % n)
fig.update_yaxes(title="Username")

fig.show()

## When most users were created

In [None]:
# Count the rows per week of account creation ('W' bins on 'created_at').
weeklyCounts = df.resample('W', on='created_at').count()

# Pick the counted column by name rather than by position: the plot below
# refers to it as 'id_str', so a positional pick would silently break if the
# column order of the CSV changed.
dfCreation = weeklyCounts[['id_str']]

fig = px.histogram(dfCreation, x=dfCreation.index, y='id_str')
fig.show()