# Dataset Analysis

This Jupyter Notebook explores the dataset used in the project to classify Instagram users. It is a labelled dataset: the label is the column 'real_account', which is true for real users and false for fake ones.

# Import the context (libraries) and the dataset (already processed)

In [51]:
import pandas as pd
import numpy as np

In [52]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler

In [53]:
# Read the real-account and fake-account CSV files into separate frames.
real_csv_path = '../data/balanced_real_data.csv'
fake_csv_path = '../data/balanced_fake_data.csv'
df_real = pd.read_csv(real_csv_path)
df_fake = pd.read_csv(fake_csv_path)

# Fast view of Real and Fake Users Datasets

 

In [54]:
# Preview the first five real-account rows.
df_real.head(n=5)

Unnamed: 0,profile_pic,biography,follows_count,followed_by_count,ff_ratio,media_count,is_private,is_verified,is_business_account,is_joined_recently,...,std_comments,var_comments,skw_comments,mean_time_between_posts,max_time_between_posts,min_time_between_posts,std_time_between_posts,var_time_between_posts,skw_time_between_posts,real_account
0,True,45,2979,846,0.283988,278,False,False,False,False,...,4.964205,24.64333,1.759163,1688451.0,7930005.0,75731.0,1964306.0,3858496000000.0,1.947676,True
1,True,131,966,5172,5.354037,150,False,False,True,False,...,0.472582,0.2233333,2.951336,197835.2,851011.0,3855.0,222204.1,49374670000.0,1.785389,True
2,True,128,1662,1475,0.887485,403,False,False,True,False,...,5.961543,35.54,1.546427,295033.8,1128895.0,9212.0,241783.2,58459110000.0,1.726161,True
3,True,0,236,111,0.470339,43,True,False,False,False,...,,,,,,,,,,True
4,True,16,386,967516,2506.518135,34,False,True,True,False,...,1180.090381,1392613.0,0.319856,2418942.0,12726352.0,36.0,2814164.0,7919517000000.0,2.273966,True


In [55]:
# Preview the first five fake-account rows.
df_fake.head(n=5)

Unnamed: 0,profile_pic,biography,follows_count,followed_by_count,ff_ratio,media_count,is_private,is_verified,is_business_account,is_joined_recently,...,std_comments,var_comments,skw_comments,mean_time_between_posts,max_time_between_posts,min_time_between_posts,std_time_between_posts,var_time_between_posts,skw_time_between_posts,real_account
0,True,56,2321,747,0.321844,10,True,False,False,False,...,,,,,,,,,,False
1,True,147,4247,611,0.143866,99,False,False,True,False,...,0.678233,0.46,2.068805,80254.208333,1466742.0,10.0,297334.2,88407620000.0,4.488934,False
2,True,127,1764,1613,0.914399,64,False,False,False,False,...,10.224643,104.543333,3.196293,333885.0,836121.0,27637.0,224060.2,50202990000.0,0.519589,False
3,True,0,68,65,0.955882,1,False,False,False,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
4,True,80,242,134,0.553719,80,False,False,False,False,...,0.6245,0.39,4.086591,694503.0,6477346.0,8.0,1489132.0,2217513000000.0,2.800966,False


# Analysis of number of Private Users in each Dataset (Real and Fake)

The number of entries is shown below each table.

In [56]:
# Real accounts whose profile is flagged as private.
real_is_private = df_real['is_private'].eq(True)
df_real.loc[real_is_private]

Unnamed: 0,profile_pic,biography,follows_count,followed_by_count,ff_ratio,media_count,is_private,is_verified,is_business_account,is_joined_recently,...,std_comments,var_comments,skw_comments,mean_time_between_posts,max_time_between_posts,min_time_between_posts,std_time_between_posts,var_time_between_posts,skw_time_between_posts,real_account
3,True,0,236,111,0.470339,43,True,False,False,False,...,,,,,,,,,,True
20,True,70,426,397,0.931925,164,True,False,False,False,...,,,,,,,,,,True
25,True,0,530,708,1.335849,487,True,False,False,False,...,,,,,,,,,,True
31,True,0,485,222,0.457732,79,True,False,False,False,...,,,,,,,,,,True
35,True,133,1162,1166,1.003442,297,True,False,False,False,...,,,,,,,,,,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7629,True,29,759,667,0.878788,175,True,False,False,False,...,,,,,,,,,,True
7646,True,0,315,355,1.126984,15,True,False,False,False,...,,,,,,,,,,True
7649,True,55,377,145,0.384615,572,True,False,False,False,...,,,,,,,,,,True
7650,True,85,381,1881,4.937008,247,True,False,False,False,...,,,,,,,,,,True


In [57]:
# Fake accounts whose profile is flagged as private.
fake_is_private = df_fake['is_private'].eq(True)
df_fake.loc[fake_is_private]

Unnamed: 0,profile_pic,biography,follows_count,followed_by_count,ff_ratio,media_count,is_private,is_verified,is_business_account,is_joined_recently,...,std_comments,var_comments,skw_comments,mean_time_between_posts,max_time_between_posts,min_time_between_posts,std_time_between_posts,var_time_between_posts,skw_time_between_posts,real_account
0,True,56,2321,747,0.321844,10,True,False,False,False,...,,,,,,,,,,False
14,True,33,19,26,1.368421,6,True,False,False,False,...,,,,,,,,,,False
19,False,51,2782,706,0.253774,7,True,False,False,False,...,,,,,,,,,,False
25,True,0,2618,323,0.123377,0,True,False,False,False,...,,,,,,,,,,False
37,True,0,1201,405,0.337219,16,True,False,False,False,...,,,,,,,,,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7622,True,33,547,322,0.588665,8,True,False,False,False,...,,,,,,,,,,False
7627,True,0,155,139,0.896774,5,True,False,False,False,...,,,,,,,,,,False
7638,True,14,7077,899,0.127031,13,True,False,False,False,...,,,,,,,,,,False
7648,True,62,912,945,1.036184,513,True,False,False,False,...,,,,,,,,,,False


# Creation of the Final Dataset merging the ones above

In [58]:
# Stack the real accounts first and the fake accounts after them;
# ignore_index=True renumbers the rows of the combined frame from 0.
frames_to_stack = [df_real, df_fake]
df = pd.concat(frames_to_stack, ignore_index=True)

# Fix the inconsistency in the Dataset

In the project we used two different scrapers to work around Instagram's limit on the number of requests. The first scraper collects the users' profile information; the second one collects their media and computes the statistics. Because the two scrapers ran at different times, some users may have switched from private to public (or vice versa) in between. Media statistics cannot be collected from a private profile, so a mismatch between the `is_private` attribute and the presence of media statistics (e.g. `min_likes`) is a consistency problem. Two cases are handled. If a user is marked private but media statistics are present, the profile was public when the media were collected, so the attribute is switched from private to public. Conversely, if a user is marked public and has posted media (`media_count` is not 0) but no statistics could be collected, the formerly public profile is assumed to have become private, so the attribute is switched from public to private.

In [59]:
def fix_private_entries(dataset):
    """Reconcile the ``is_private`` flag with the presence of media statistics.

    A non-null ``min_likes`` is used as the indicator that media statistics
    exist for a row. Two corrections are applied, in this order:

    1. Rows flagged private that do have statistics are set to public.
    2. Rows flagged public that have no statistics although ``media_count``
       is not 0 are set to private.

    Parameters
    ----------
    dataset : pandas.DataFrame
        Frame with the columns ``is_private``, ``min_likes`` and
        ``media_count``. It is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same ``dataset`` object that was passed in.
    """
    # min_likes is never written here, so this mask is valid for both rules.
    has_media_stats = dataset['min_likes'].notna()

    # Rule 1: private flag contradicts the presence of statistics.
    stale_private = dataset['is_private'].eq(True) & has_media_stats
    dataset.loc[stale_private, ['is_private']] = False

    # Rule 2: evaluated after rule 1, on the already-updated flag.
    has_posts = dataset['media_count'].ne(0)
    stale_public = dataset['is_private'].eq(False) & ~has_media_stats & has_posts
    dataset.loc[stale_public, ['is_private']] = True

    return dataset

In [60]:
# Every row of the merged frame currently flagged as private.
merged_is_private = df['is_private'].eq(True)
df.loc[merged_is_private]

Unnamed: 0,profile_pic,biography,follows_count,followed_by_count,ff_ratio,media_count,is_private,is_verified,is_business_account,is_joined_recently,...,std_comments,var_comments,skw_comments,mean_time_between_posts,max_time_between_posts,min_time_between_posts,std_time_between_posts,var_time_between_posts,skw_time_between_posts,real_account
3,True,0,236,111,0.470339,43,True,False,False,False,...,,,,,,,,,,True
20,True,70,426,397,0.931925,164,True,False,False,False,...,,,,,,,,,,True
25,True,0,530,708,1.335849,487,True,False,False,False,...,,,,,,,,,,True
31,True,0,485,222,0.457732,79,True,False,False,False,...,,,,,,,,,,True
35,True,133,1162,1166,1.003442,297,True,False,False,False,...,,,,,,,,,,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15276,True,33,547,322,0.588665,8,True,False,False,False,...,,,,,,,,,,False
15281,True,0,155,139,0.896774,5,True,False,False,False,...,,,,,,,,,,False
15292,True,14,7077,899,0.127031,13,True,False,False,False,...,,,,,,,,,,False
15302,True,62,912,945,1.036184,513,True,False,False,False,...,,,,,,,,,,False


In [61]:
# Inconsistent rows, case 1: flagged private yet media statistics are present.
private_with_stats = df['is_private'].eq(True) & df['min_likes'].notna()
df.loc[private_with_stats]

Unnamed: 0,profile_pic,biography,follows_count,followed_by_count,ff_ratio,media_count,is_private,is_verified,is_business_account,is_joined_recently,...,std_comments,var_comments,skw_comments,mean_time_between_posts,max_time_between_posts,min_time_between_posts,std_time_between_posts,var_time_between_posts,skw_time_between_posts,real_account
64,True,61,1113,628,0.564241,555,True,False,False,False,...,1.339154,1.793333,0.430254,338340.1,1035596.0,33293.0,210093.7,44139380000.0,1.591702,True
463,True,114,198,79655,402.29798,1524,True,False,False,False,...,20.938561,438.423333,2.255061,58434.46,162763.0,697.0,53688.29,2882433000.0,0.656763,True
563,True,60,353,1683,4.767705,492,True,False,False,False,...,19.979323,399.173333,1.573629,1465247.0,13550111.0,85677.0,2668473.0,7120749000000.0,4.110943,True
632,True,89,2082,6546,3.144092,249,True,False,False,False,...,14.857994,220.76,1.516364,530528.9,1841059.0,1626.0,673556.2,453678000000.0,0.946825,True
1892,True,100,953,626,0.656873,101,True,False,False,False,...,1.997498,3.99,1.026663,280234.5,1185088.0,112.0,294443.4,86696950000.0,1.386961,True
2017,True,147,902,2602,2.884701,642,True,False,False,False,...,2.314447,5.356667,-0.031184,238850.3,603786.0,33387.0,184874.4,34178530000.0,0.929054,True
2018,True,64,2039,338,0.165768,127,True,False,False,False,...,0.472582,0.223333,2.951336,352004.3,5810554.0,44.0,1259476.0,1586280000000.0,3.780921,True
2430,True,141,211,189,0.895735,235,True,False,False,False,...,2.598076,6.75,3.267823,921340.4,3271641.0,429.0,977913.0,956313800000.0,0.931225,True
2589,True,4,375,588,1.568,126,True,False,False,False,...,3.752777,14.083333,0.699061,3648195.0,13257047.0,75913.0,3729993.0,13912850000000.0,1.36247,True
3169,True,27,815,553,0.678528,228,True,False,False,False,...,2.527845,6.39,1.652588,3830910.0,13276510.0,40864.0,3664062.0,13425350000000.0,1.185874,True


In [62]:
# Inconsistent rows, case 2: flagged public, media_count is not 0,
# but no media statistics are present.
public_without_stats = (
    df['is_private'].eq(False)
    & df['min_likes'].isna()
    & df['media_count'].ne(0)
)
df.loc[public_without_stats]

Unnamed: 0,profile_pic,biography,follows_count,followed_by_count,ff_ratio,media_count,is_private,is_verified,is_business_account,is_joined_recently,...,std_comments,var_comments,skw_comments,mean_time_between_posts,max_time_between_posts,min_time_between_posts,std_time_between_posts,var_time_between_posts,skw_time_between_posts,real_account
343,True,54,687,2735,3.981077,269,False,False,True,False,...,,,,,,,,,,True
853,True,30,472,1411,2.989407,640,False,False,False,False,...,,,,,,,,,,True
2623,True,44,1347,1237,0.918337,224,False,False,False,False,...,,,,,,,,,,True
5277,True,0,13,270,20.769231,16,False,False,False,False,...,,,,,,,,,,True
5638,True,9,286,202,0.706294,10,False,False,False,False,...,,,,,,,,,,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
12995,True,145,6484,1213,0.187076,29,False,False,True,False,...,,,,,,,,,,False
13142,True,12,924,151,0.163420,21,False,False,True,False,...,,,,,,,,,,False
13770,True,97,6636,1973,0.297318,16811,False,False,False,False,...,,,,,,,,,,False
14783,False,0,4474,911,0.203621,1,False,False,True,False,...,,,,,,,,,,False


In [63]:
# Correct the is_private flag on rows where it disagrees with the scraped
# media statistics (the two scrapers ran at different times).
df = fix_private_entries(dataset=df)

In [64]:
# Verification only (nothing is modified here): after the fix, no row should
# be flagged private while carrying media statistics, so this should be empty.
leftover_private_with_stats = df['is_private'].eq(True) & df['min_likes'].notna()
df.loc[leftover_private_with_stats]

Unnamed: 0,profile_pic,biography,follows_count,followed_by_count,ff_ratio,media_count,is_private,is_verified,is_business_account,is_joined_recently,...,std_comments,var_comments,skw_comments,mean_time_between_posts,max_time_between_posts,min_time_between_posts,std_time_between_posts,var_time_between_posts,skw_time_between_posts,real_account


In [65]:
# Verification only: after the fix, no row should be flagged public with
# media_count != 0 and missing statistics, so this should be empty.
leftover_public_without_stats = (
    df['is_private'].eq(False)
    & df['min_likes'].isna()
    & df['media_count'].ne(0)
)
df.loc[leftover_public_without_stats]

Unnamed: 0,profile_pic,biography,follows_count,followed_by_count,ff_ratio,media_count,is_private,is_verified,is_business_account,is_joined_recently,...,std_comments,var_comments,skw_comments,mean_time_between_posts,max_time_between_posts,min_time_between_posts,std_time_between_posts,var_time_between_posts,skw_time_between_posts,real_account


In [66]:
# Names of the first 30 columns of the merged frame.
# NOTE(review): the frame has exactly 30 columns at this point, so this slice
# also picks up the label column 'real_account' (it is the last name in the
# list); a target slice of df.columns[30:31] would therefore be empty.
# Confirm whether the label is meant to be part of features_columns.
features_columns = df.columns[:30].tolist()
print(features_columns)

['profile_pic', 'biography', 'follows_count', 'followed_by_count', 'ff_ratio', 'media_count', 'is_private', 'is_verified', 'is_business_account', 'is_joined_recently', 'highlight_reel_count', 'average_likes', 'max_likes', 'min_likes', 'std_likes', 'var_likes', 'skw_likes', 'average_comments', 'max_comments', 'min_comments', 'std_comments', 'var_comments', 'skw_comments', 'mean_time_between_posts', 'max_time_between_posts', 'min_time_between_posts', 'std_time_between_posts', 'var_time_between_posts', 'skw_time_between_posts', 'real_account']


In [67]:
# (rows, columns) of the merged frame; the label column 'real_account'
# is still part of df at this point.
df.shape

(15308, 30)

In [68]:
# Pull the label out as an array before it is removed from the frame.
label_series = df['real_account']
targets = label_series.values

# Label-based .loc slices include both endpoints, so this keeps every column
# from 'profile_pic' through 'skw_time_between_posts' and leaves out
# 'real_account'. df is rebound here, so re-running this cell on its own
# fails: the label column is no longer present.
df = df.loc[:, 'profile_pic':'skw_time_between_posts']

In [69]:
# Display the label array taken from the 'real_account' column.
targets

array([ True,  True,  True, ..., False, False, False])