# Dataset Analysis

The aim of this Jupyter Notebook is to explore the dataset used in the project for classifying Instagram users. It is a labelled dataset: the label is the column 'real_account', whose possible values are True (for real users) and False (for fake ones).

# Import the context (libraries) and the dataset (already processed)

In [1]:
import pandas as pd
import numpy as np

In [2]:
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler

In [3]:
# Read the real-user and fake-user CSV files into two separate DataFrames.
# The paths are relative, so they resolve against the current working directory.
df_real = pd.read_csv('../data/balanced_real_data.csv')
df_fake = pd.read_csv('../data/balanced_fake_data.csv')

# Fast view of Real and Fake Users Datasets

 

In [4]:
# Preview the first rows of the real-user table.
df_real.head()

Unnamed: 0,username,profile_pic,biography,follows_count,followed_by_count,ff_ratio,media_count,is_private,is_verified,is_business_account,...,std_comments,var_comments,skw_comments,mean_time_between_posts,max_time_between_posts,min_time_between_posts,std_time_between_posts,var_time_between_posts,skw_time_between_posts,real_account
0,irinashayk,True,11,597,13156189,22037.167504,1592,False,True,False,...,878.448591,771671.926667,2.252054,207708.3,611811.0,31545.0,171989.4,29580360000.0,1.075537,True
1,dementieva_a______,True,0,423,192,0.453901,50,True,False,False,...,,,,,,,,,,True
2,lorenz.fini,True,0,426,151,0.35446,37,False,False,False,...,2.827248,7.993333,4.049417,2232875.0,7637239.0,92.0,2328249.0,5420744000000.0,0.828757,True
3,jessicagiulia,True,128,450,694,1.542222,1144,False,False,False,...,1.683251,2.833333,0.963009,126078.5,343781.0,83938.0,80450.28,6472248000.0,1.596275,True
4,il_socio_aci,True,60,1,327728,327728.0,392,False,False,True,...,123.279128,15197.743333,2.519276,328715.3,2955259.0,64159.0,582898.5,339770700000.0,4.056672,True


In [5]:
# Preview the first rows of the fake-user table.
df_fake.head()

Unnamed: 0,username,profile_pic,biography,follows_count,followed_by_count,ff_ratio,media_count,is_private,is_verified,is_business_account,...,std_comments,var_comments,skw_comments,mean_time_between_posts,max_time_between_posts,min_time_between_posts,std_time_between_posts,var_time_between_posts,skw_time_between_posts,real_account
0,_s.a.v.a__v.l.a.d_,True,56,2321,747,0.321844,10,True,False,False,...,,,,,,,,,,False
1,pp_sport_julduz,True,147,4247,611,0.143866,99,False,False,True,...,0.678233,0.46,2.068805,80254.208333,1466742.0,10.0,297334.2,88407620000.0,4.488934,False
2,clark5.7lyfe,True,127,1764,1613,0.914399,64,False,False,False,...,10.224643,104.543333,3.196293,333885.0,836121.0,27637.0,224060.2,50202990000.0,0.519589,False
3,nata53149,True,0,68,65,0.955882,1,False,False,False,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,False
4,olgashatko.63,True,80,242,134,0.553719,80,False,False,False,...,0.6245,0.39,4.086591,694503.0,6477346.0,8.0,1489132.0,2217513000000.0,2.800966,False


# Analysis of number of Private Users in each Dataset (Real and Fake)

The number of entries is shown below each table.

In [6]:
# Rows of the real-user table whose 'is_private' flag is set.
df_real.loc[df_real['is_private'].eq(True)]

Unnamed: 0,username,profile_pic,biography,follows_count,followed_by_count,ff_ratio,media_count,is_private,is_verified,is_business_account,...,std_comments,var_comments,skw_comments,mean_time_between_posts,max_time_between_posts,min_time_between_posts,std_time_between_posts,var_time_between_posts,skw_time_between_posts,real_account
1,dementieva_a______,True,0,423,192,0.453901,50,True,False,False,...,,,,,,,,,,True
9,carmenmb99,True,24,1032,1054,1.021318,98,True,False,False,...,,,,,,,,,,True
10,msbritt319,True,0,605,320,0.528926,174,True,False,False,...,,,,,,,,,,True
15,ela.jiminx,True,21,1007,453,0.449851,8,True,False,False,...,,,,,,,,,,True
18,robert.tocilla,True,69,728,957,1.314560,18,True,False,False,...,,,,,,,,,,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7637,_gconti_,True,47,1281,1088,0.849336,219,True,False,False,...,,,,,,,,,,True
7643,maffucci_lorenzo2,True,0,677,393,0.580502,31,True,False,False,...,,,,,,,,,,True
7644,88fuckinkeys,True,30,835,1035,1.239521,257,True,False,False,...,,,,,,,,,,True
7650,silviabiscosi,True,0,662,179,0.270393,232,True,False,False,...,,,,,,,,,,True


In [7]:
# Rows of the fake-user table whose 'is_private' flag is set.
df_fake.loc[df_fake['is_private'].eq(True)]

Unnamed: 0,username,profile_pic,biography,follows_count,followed_by_count,ff_ratio,media_count,is_private,is_verified,is_business_account,...,std_comments,var_comments,skw_comments,mean_time_between_posts,max_time_between_posts,min_time_between_posts,std_time_between_posts,var_time_between_posts,skw_time_between_posts,real_account
0,_s.a.v.a__v.l.a.d_,True,56,2321,747,0.321844,10,True,False,False,...,,,,,,,,,,False
14,hundalgeeta,True,33,19,26,1.368421,6,True,False,False,...,,,,,,,,,,False
19,dadi_borzooo,False,51,2782,706,0.253774,7,True,False,False,...,,,,,,,,,,False
25,itz_jonathan_1,True,0,2618,323,0.123377,0,True,False,False,...,,,,,,,,,,False
37,ana__chuma,True,0,1201,405,0.337219,16,True,False,False,...,,,,,,,,,,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
7622,umari.insta,True,33,547,322,0.588665,8,True,False,False,...,,,,,,,,,,False
7627,priyanshthawariya,True,0,155,139,0.896774,5,True,False,False,...,,,,,,,,,,False
7638,andrestavlar,True,14,7077,899,0.127031,13,True,False,False,...,,,,,,,,,,False
7648,kristina_zhuchka,True,62,912,945,1.036184,513,True,False,False,...,,,,,,,,,,False


# Creation of the Final Dataset merging the ones above

In [8]:
# Stack the real-user rows on top of the fake-user rows and renumber the
# index from 0 so that row labels are unique in the merged frame.
df = pd.concat(
    [df_real, df_fake],
    ignore_index=True,
)

# Fix the inconsistency in the Dataset

In the project we used two different scrapers to avoid Instagram's limit on the number of requests. The first scraper collects the user information, and the second one collects the media and computes the statistics. Because the two scrapers were run at different times, some users may have switched from public to private in between. It is not possible to get information from private users, so a row that is marked private but still has media statistics is inconsistent. The solution adopted is to switch the `is_private` attribute to public, since the users were public when we collected the data. An alternative would be to keep them private and delete the statistics, but that would drop additional information that could be interesting and useful for the classification problem.

In [9]:
def fix_private_entries(dataset):
    """Clear the 'is_private' flag on rows that still carry like statistics.

    A row is treated as inconsistent when 'is_private' is True and
    'min_likes' is not null. For those rows 'is_private' is set to False.

    The frame passed in is modified in place, and that same object is
    returned.
    """
    has_stats = dataset['min_likes'].notna()
    flagged_private = dataset['is_private'].eq(True)
    dataset.loc[flagged_private & has_stats, 'is_private'] = False
    return dataset

In [10]:
# All rows of the merged frame that are flagged as private.
df[df['is_private'].eq(True)]

Unnamed: 0,username,profile_pic,biography,follows_count,followed_by_count,ff_ratio,media_count,is_private,is_verified,is_business_account,...,std_comments,var_comments,skw_comments,mean_time_between_posts,max_time_between_posts,min_time_between_posts,std_time_between_posts,var_time_between_posts,skw_time_between_posts,real_account
1,dementieva_a______,True,0,423,192,0.453901,50,True,False,False,...,,,,,,,,,,True
9,carmenmb99,True,24,1032,1054,1.021318,98,True,False,False,...,,,,,,,,,,True
10,msbritt319,True,0,605,320,0.528926,174,True,False,False,...,,,,,,,,,,True
15,ela.jiminx,True,21,1007,453,0.449851,8,True,False,False,...,,,,,,,,,,True
18,robert.tocilla,True,69,728,957,1.314560,18,True,False,False,...,,,,,,,,,,True
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
15276,umari.insta,True,33,547,322,0.588665,8,True,False,False,...,,,,,,,,,,False
15281,priyanshthawariya,True,0,155,139,0.896774,5,True,False,False,...,,,,,,,,,,False
15292,andrestavlar,True,14,7077,899,0.127031,13,True,False,False,...,,,,,,,,,,False
15302,kristina_zhuchka,True,62,912,945,1.036184,513,True,False,False,...,,,,,,,,,,False


In [11]:
# Inconsistent rows: flagged as private, yet 'min_likes' is populated.
df[df['is_private'].eq(True) & df['min_likes'].notna()]

Unnamed: 0,username,profile_pic,biography,follows_count,followed_by_count,ff_ratio,media_count,is_private,is_verified,is_business_account,...,std_comments,var_comments,skw_comments,mean_time_between_posts,max_time_between_posts,min_time_between_posts,std_time_between_posts,var_time_between_posts,skw_time_between_posts,real_account
257,giuliacann,True,61,1113,628,0.564241,555,True,False,False,...,1.339154,1.793333,0.430254,338340.1,1035596.0,33293.0,210093.7,44139380000.0,1.591702,True
260,ellensynlove,True,0,530,494,0.932075,109,True,False,False,...,1.691153,2.86,3.198243,1029883.0,4314974.0,864.0,1241301.0,1540829000000.0,1.462782,True
1196,nerepast,True,135,520,563,1.082692,94,True,False,False,...,4.148494,17.21,3.00112,571107.4,2197674.0,302.0,695005.7,483033000000.0,1.26638,True
1696,giorgi_10,True,2,669,888,1.327354,639,True,False,False,...,2.573584,6.623333,1.002724,438499.5,4035110.0,476.0,911701.1,831199000000.0,3.035207,True
1753,movimento5litri,True,123,81,1465587,18093.666667,8552,True,False,False,...,4229.990925,17892820.0,4.627252,21528.75,60477.0,4397.0,20779.2,431775000.0,1.070064,True
2068,_dilettacioni,True,0,354,890,2.514124,24,True,False,False,...,1.190999,1.418478,1.017891,2874744.0,18726342.0,96289.0,4172031.0,17405840000000.0,2.665449,True
2448,peg_alexander,True,80,2864,5055,1.765014,85,True,False,False,...,1.598958,2.556667,2.105953,1515798.0,9251686.0,23.0,2618323.0,6855614000000.0,1.845048,True
2487,rava.41,True,43,314,291,0.926752,101,True,False,False,...,0.957427,0.9166667,1.453641,4121823.0,12777378.0,255191.0,3531217.0,12469500000000.0,0.983721,True
2494,carloshernandz23,True,21,952,1441,1.513655,22,True,False,False,...,14.477205,209.5895,2.894582,5064787.0,17432517.0,106689.0,5551776.0,30822210000000.0,1.162411,True
2676,jero.rod,True,7,379,1205,3.17942,49,True,False,False,...,12.030932,144.7433,0.35922,3430008.0,12432499.0,343312.0,2956376.0,8740159000000.0,1.436364,True


In [12]:
# Repair the rows wrongly flagged as private (a scraping artefact).
# The helper updates the frame in place and returns the same object.
df = fix_private_entries(dataset=df)

In [13]:
# Sanity check: after the fix, no private row should still carry like
# statistics, so the expected result is an empty table.
df[df['is_private'].eq(True) & df['min_likes'].notna()]

Unnamed: 0,username,profile_pic,biography,follows_count,followed_by_count,ff_ratio,media_count,is_private,is_verified,is_business_account,...,std_comments,var_comments,skw_comments,mean_time_between_posts,max_time_between_posts,min_time_between_posts,std_time_between_posts,var_time_between_posts,skw_time_between_posts,real_account


In [14]:
# Every column except the 'real_account' label. Dropping the label by name
# avoids a positional slice that depends on the column count and order, and
# raises a KeyError if the label column is missing.
# Note that the list still contains the 'username' identifier.
features_columns = list(df.columns.drop('real_account'))
print(features_columns)

['username', 'profile_pic', 'biography', 'follows_count', 'followed_by_count', 'ff_ratio', 'media_count', 'is_private', 'is_verified', 'is_business_account', 'is_joined_recently', 'highlight_reel_count', 'average_likes', 'max_likes', 'min_likes', 'std_likes', 'var_likes', 'skw_likes', 'average_comments', 'max_comments', 'min_comments', 'std_comments', 'var_comments', 'skw_comments', 'mean_time_between_posts', 'max_time_between_posts', 'min_time_between_posts', 'std_time_between_posts', 'var_time_between_posts', 'skw_time_between_posts']


In [15]:
# (number of rows, number of columns) of the merged frame.
df.shape

(15308, 31)

In [16]:
# Pull the labels out first: the slice below removes 'real_account' from df,
# so this cell cannot be re-run without rebuilding df.
targets = df['real_account'].to_numpy()
# Keep only the columns from 'profile_pic' through 'skw_time_between_posts'.
# Label-based slices include both endpoints; 'username' is left out.
df = df.loc[:, 'profile_pic':'skw_time_between_posts']

In [17]:
# Display the label array taken from the 'real_account' column.
targets

array([ True,  True,  True, ..., False, False, False])