This notebook simply builds our balanced and unbalanced training and testing datasets from our user information and user labels.  It performs high-level data exploration.

In [1]:
import pandas as pd
from sklearn.model_selection import train_test_split
import math

Import datasets

In [2]:
df_labels=pd.read_csv('./data/midterm-2018.tsv', sep='\t')
df_labels=df_labels.rename(columns={'2521260264':'user_id'})
df_labels.head()

Unnamed: 0,user_id,bot
0,2521267226,bot
1,2521271036,bot
2,2521301466,bot
3,2521307095,bot
4,2521308265,bot


In [3]:
df_users=pd.read_json('./data/midterm-2018_processed_user_objects.json')
df_users.head()

Unnamed: 0,probe_timestamp,user_id,screen_name,name,description,user_created_at,url,lang,protected,verified,geo_enabled,profile_use_background_image,default_profile,followers_count,friends_count,listed_count,favourites_count,statuses_count,tid
0,Tue Nov 06 20:35:08 2018,4107317134,danitheduck21,Dani🏳️‍🌈,Dani 💜 She/Her 💜 Randomness all over. Expect l...,2015-11-03 21:16:13,,en,0.0,False,False,False,False,481,870,26,6542,67025,1059907055421509632
1,Tue Nov 06 17:57:51 2018,4858296837,ncaraballoPR,Natalie Caraballo,"Things I don’t get tired of: Politics, Amy Win...",2016-01-28 20:03:51,,en,0.0,False,False,False,False,202,712,5,1515,158,1059867472810180609
2,Tue Nov 06 20:35:23 2018,232631847,drmendezmd,Wilson,"Latin american100%! Let fight for our country,...",2010-12-31 18:55:05,,en,0.0,False,True,True,True,278,342,4,4780,4029,1059907117094711296
3,Tue Nov 06 19:23:19 2018,16700555,ScottNevins,Scott Nevins,TV Personality & Host | Political/News Contrib...,2008-10-11 21:39:34,http://www.ScottNevins.com,en,0.0,True,False,True,False,29546,384,402,143163,53427,1059888980957650944
4,Tue Nov 06 20:35:24 2018,334443152,lild1206,D,,2011-07-13 03:13:52,,en,0.0,False,True,True,True,95,668,1,1178,1315,1059907122408898562


Merge our user objects and labels.

In [4]:
df=pd.merge(df_users, df_labels, on=['user_id'])
df.head()

Unnamed: 0,probe_timestamp,user_id,screen_name,name,description,user_created_at,url,lang,protected,verified,geo_enabled,profile_use_background_image,default_profile,followers_count,friends_count,listed_count,favourites_count,statuses_count,tid,bot
0,Tue Nov 06 20:35:08 2018,4107317134,danitheduck21,Dani🏳️‍🌈,Dani 💜 She/Her 💜 Randomness all over. Expect l...,2015-11-03 21:16:13,,en,0.0,False,False,False,False,481,870,26,6542,67025,1059907055421509632,human
1,Tue Nov 06 17:57:51 2018,4858296837,ncaraballoPR,Natalie Caraballo,"Things I don’t get tired of: Politics, Amy Win...",2016-01-28 20:03:51,,en,0.0,False,False,False,False,202,712,5,1515,158,1059867472810180609,human
2,Tue Nov 06 20:35:23 2018,232631847,drmendezmd,Wilson,"Latin american100%! Let fight for our country,...",2010-12-31 18:55:05,,en,0.0,False,True,True,True,278,342,4,4780,4029,1059907117094711296,human
3,Tue Nov 06 19:23:19 2018,16700555,ScottNevins,Scott Nevins,TV Personality & Host | Political/News Contrib...,2008-10-11 21:39:34,http://www.ScottNevins.com,en,0.0,True,False,True,False,29546,384,402,143163,53427,1059888980957650944,human
4,Tue Nov 06 20:35:24 2018,334443152,lild1206,D,,2011-07-13 03:13:52,,en,0.0,False,True,True,True,95,668,1,1178,1315,1059907122408898562,human


Look at the shape and information.

In [5]:
print(df.shape)

(50537, 20)


In [6]:
print(df.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 50537 entries, 0 to 50536
Data columns (total 20 columns):
 #   Column                        Non-Null Count  Dtype         
---  ------                        --------------  -----         
 0   probe_timestamp               50537 non-null  object        
 1   user_id                       50537 non-null  int64         
 2   screen_name                   50537 non-null  object        
 3   name                          50537 non-null  object        
 4   description                   22314 non-null  object        
 5   user_created_at               50537 non-null  datetime64[ns]
 6   url                           7473 non-null   object        
 7   lang                          50537 non-null  object        
 8   protected                     46994 non-null  float64       
 9   verified                      50537 non-null  bool          
 10  geo_enabled                   50537 non-null  bool          
 11  profile_use_background_image

In [7]:
df.describe()

Unnamed: 0,user_id,protected,followers_count,friends_count,listed_count,favourites_count,statuses_count,tid
count,50537.0,46994.0,50537.0,50537.0,50537.0,50537.0,50537.0,50537.0
mean,8.925762e+17,0.0,2646.671,291.853157,15.767556,2314.882522,2450.896076,1.057336e+18
std,3.635423e+17,0.0,232273.7,2671.700187,449.022183,12672.75653,13770.362987,6132038000000000.0
min,74613.0,0.0,0.0,0.0,0.0,0.0,1.0,7.960691e+17
25%,9.868175e+17,0.0,0.0,0.0,0.0,0.0,8.0,1.052275e+18
50%,1.052364e+18,0.0,1.0,9.0,0.0,0.0,30.0,1.057426e+18
75%,1.056739e+18,0.0,16.0,141.0,0.0,75.0,147.0,1.059911e+18
max,1.078964e+18,0.0,50865900.0,298234.0,67930.0,463821.0,599307.0,1.07897e+18


In [8]:
df.describe(include=['object', 'bool'])

Unnamed: 0,probe_timestamp,screen_name,name,description,url,lang,verified,geo_enabled,profile_use_background_image,default_profile,bot
count,50537,50537,50537.0,22314,7473,50537,50537,50537,50537,50537,50537
unique,49111,50531,42190.0,21861,7084,34,2,2,2,2,2
top,Wed Oct 10 14:17:55 2018,SMPotbury,,Afraid of nada,http://wmna.sh/bstexomashore,en,False,False,True,True,bot
freq,5,2,446.0,43,40,48005,50105,45867,47491,44045,42445


In [9]:
print(df['bot'].value_counts())

bot      42445
human     8092
Name: bot, dtype: int64


Drop columns that are likely to be irrelevant.

In [10]:
df.drop(columns=['probe_timestamp', 'user_id', 'screen_name', 'name', 'description', 'user_created_at', 'url', 'lang', 'protected', 'tid'], inplace=True)
print(df.shape)
print(df.info())

(50537, 10)
<class 'pandas.core.frame.DataFrame'>
Int64Index: 50537 entries, 0 to 50536
Data columns (total 10 columns):
 #   Column                        Non-Null Count  Dtype 
---  ------                        --------------  ----- 
 0   verified                      50537 non-null  bool  
 1   geo_enabled                   50537 non-null  bool  
 2   profile_use_background_image  50537 non-null  bool  
 3   default_profile               50537 non-null  bool  
 4   followers_count               50537 non-null  int64 
 5   friends_count                 50537 non-null  int64 
 6   listed_count                  50537 non-null  int64 
 7   favourites_count              50537 non-null  int64 
 8   statuses_count                50537 non-null  int64 
 9   bot                           50537 non-null  object
dtypes: bool(4), int64(5), object(1)
memory usage: 2.9+ MB
None


In [11]:
df.describe()

Unnamed: 0,followers_count,friends_count,listed_count,favourites_count,statuses_count
count,50537.0,50537.0,50537.0,50537.0,50537.0
mean,2646.671,291.853157,15.767556,2314.882522,2450.896076
std,232273.7,2671.700187,449.022183,12672.75653,13770.362987
min,0.0,0.0,0.0,0.0,1.0
25%,0.0,0.0,0.0,0.0,8.0
50%,1.0,9.0,0.0,0.0,30.0
75%,16.0,141.0,0.0,75.0,147.0
max,50865900.0,298234.0,67930.0,463821.0,599307.0


In [12]:
df.describe(include=['object', 'bool'])

Unnamed: 0,verified,geo_enabled,profile_use_background_image,default_profile,bot
count,50537,50537,50537,50537,50537
unique,2,2,2,2,2
top,False,False,True,True,bot
freq,50105,45867,47491,44045,42445


Let's look at the differences between the two classes.

In [13]:
df[df['bot']=='bot'].describe()

Unnamed: 0,followers_count,friends_count,listed_count,favourites_count,statuses_count
count,42445.0,42445.0,42445.0,42445.0,42445.0
mean,19.216044,76.095158,0.086653,79.031594,123.407516
std,145.944639,232.620177,0.851544,757.281912,892.563302
min,0.0,0.0,0.0,0.0,1.0
25%,0.0,0.0,0.0,0.0,7.0
50%,0.0,1.0,0.0,0.0,20.0
75%,2.0,54.0,0.0,9.0,69.0
max,11598.0,10937.0,137.0,74484.0,80487.0


In [14]:
df[df['bot']=='bot'].describe(include=['object', 'bool'])

Unnamed: 0,verified,geo_enabled,profile_use_background_image,default_profile,bot
count,42445,42445,42445,42445,42445
unique,1,2,2,2,1
top,False,False,True,True,bot
freq,42445,42277,41306,41306,42445


In [15]:
df[df['bot']=='human'].describe()

Unnamed: 0,followers_count,friends_count,listed_count,favourites_count,statuses_count
count,8092.0,8092.0,8092.0,8092.0,8092.0
mean,16428.47,1423.569451,98.01866,14042.600346,14659.281142
std,580301.3,6540.2205,1118.595029,28918.900237,31665.67386
min,0.0,0.0,0.0,0.0,1.0
25%,107.0,211.0,1.0,930.75,1067.75
50%,362.0,478.0,7.0,3983.5,4351.5
75%,1345.25,1092.5,32.0,13780.5,14197.25
max,50865900.0,298234.0,67930.0,463821.0,599307.0


In [16]:
df[df['bot']=='human'].describe(include=['object', 'bool'])

Unnamed: 0,verified,geo_enabled,profile_use_background_image,default_profile,bot
count,8092,8092,8092,8092,8092
unique,2,2,2,2,1
top,False,True,True,False,human
freq,7660,4502,6185,5353,8092


Create unbalanced train and test dataframes

In [17]:
train_u, test_u = train_test_split(df, test_size=0.2)

In [18]:
if all(train_u.dtypes == test_u.dtypes):
  print("pass: data types are all same between train and test")
else:
  print('\nFAIL: DATA TYPES ARE NOT EQUAL BETWEEN TRAIN AND TEST, SOMETHING IS WRONG\n')

pass: data types are all same between train and test


Save our unbalanced training and test sets.

In [19]:
train_u.to_json('./data/unbal_train.json')
test_u.to_json('./data/unbal_test.json')

Create balanced train and test datframes

In [20]:
bots=df[df['bot']=='bot']
humans=df[df['bot']=='human']
bots_sample=bots.sample(8092)
bal_df=bots_sample.append(humans)

train_b, test_b = train_test_split(bal_df, test_size=0.2)

In [21]:
if all(train_b.dtypes == test_b.dtypes):
  print("pass: data types are all same between train and test")
else:
  print('\nFAIL: DATA TYPES ARE NOT EQUAL BETWEEN TRAIN AND TEST, SOMETHING IS WRONG\n')

pass: data types are all same between train and test


In [22]:
train_b.to_json('./data/bal_train.json')
test_b.to_json('./data/bal_test.json')

Create an unbalanced cropped dataset

In [24]:
crop_bots=bots.sample(math.floor(len(bal_df)*0.83))
crop_humans=humans.sample(len(bal_df)-len(crop_bots))
crop_df=crop_bots.append(crop_humans)
print(len(crop_df))

train_c, test_c = train_test_split(crop_df, test_size=0.2)

train_c.to_json('./data/crop_unbal_train.json')
test_c.to_json('./data/crop_unbal_test.json')


16184


Create an unbalanced dataset with more humans than bots

In [25]:
less_bots=bots.sample(math.floor(len(humans)/0.83-len(humans)))
more_humans=less_bots.append(humans)

train_m, test_m = train_test_split(more_humans, test_size=0.2)

train_m.to_json('./data/more_human_train.json')
test_m.to_json('./data/more_human_test.json')