In [1]:
import pandas as pd

In [2]:
import pickle

def _load_pickle(path):
    """Return the object stored in the pickle file at ``path``."""
    with open(path, 'rb') as handle:
        return pickle.load(handle)


# Raw account records for each class, as written by the preprocessing step.
bots = _load_pickle('processed_dataset/bot_accounts.pkl')
nonbots = _load_pickle('processed_dataset/nonbot_accounts.pkl')

In [3]:
# Number of bot and non-bot accounts loaded, in that order.
tuple(len(accounts) for accounts in (bots, nonbots))

(1003, 1130)

In [4]:
import re
import numpy as np
# Tabulate the raw account records for each class, then list the columns
# available for feature extraction.
pf_bots, pf_nonbots = (pd.DataFrame(records) for records in (bots, nonbots))
print(pf_bots.columns)

Index(['contributors_enabled', 'created_at', 'default_profile',
       'default_profile_image', 'description', 'entities', 'favourites_count',
       'follow_request_sent', 'followers_count', 'following', 'friends_count',
       'geo_enabled', 'has_extended_profile', 'id', 'id_str',
       'is_translation_enabled', 'is_translator', 'lang', 'listed_count',
       'location', 'name', 'notifications', 'profile_background_color',
       'profile_background_image_url', 'profile_background_image_url_https',
       'profile_background_tile', 'profile_banner_url', 'profile_image_url',
       'profile_image_url_https', 'profile_link_color',
       'profile_sidebar_border_color', 'profile_sidebar_fill_color',
       'profile_text_color', 'profile_use_background_image', 'protected',
       'screen_name', 'status', 'statuses_count', 'time_zone',
       'translator_type', 'url', 'utc_offset', 'verified'],
      dtype='object')


In [5]:
# Start one feature frame per class holding only the label column:
# is_bot is 1 for every bot account and 0 for every non-bot account.
bot_features = pd.DataFrame(data=[1 for _ in range(len(bots))], columns=['is_bot'])
nonbot_features = pd.DataFrame(data=[0 for _ in range(len(nonbots))], columns=['is_bot'])
bot_features.head()

Unnamed: 0,is_bot
0,1
1,1
2,1
3,1
4,1


In [6]:
# Search for 'bot' text (case-insensitive) in the account description or name.
#
# The two columns must be combined element by element: `list_a or list_b`
# on two Python lists simply returns `list_a` whenever it is non-empty, which
# would silently ignore the name column.
_BOT_PATTERN = re.compile(r'bot', flags=re.IGNORECASE)


def _has_bot_string(*texts):
    """Return 1 if any of ``texts`` contains 'bot' (case-insensitive), else 0.

    Values that are not strings (e.g. None for a missing field) are skipped
    rather than passed to the regex, which would raise TypeError.
    """
    return int(any(
        isinstance(text, str) and _BOT_PATTERN.search(text) is not None
        for text in texts
    ))


bot_features['has_bot_string'] = [
    _has_bot_string(description, name)
    for description, name in zip(pf_bots['description'], pf_bots['name'])
]
nonbot_features['has_bot_string'] = [
    _has_bot_string(description, name)
    for description, name in zip(pf_nonbots['description'], pf_nonbots['name'])
]

bot_features.head()

Unnamed: 0,is_bot,has_bot_string
0,1,1
1,1,0
2,1,0
3,1,1
4,1,1


In [7]:
# Account age in whole years, normalised by the span between
# twitter_start_date and current_year.
twitter_start_date = 2006
current_year = 2017


def _account_age(created_at):
    """Return the normalised age for a ``created_at`` string.

    The last space-separated token of ``created_at`` is parsed as the
    creation year.
    """
    creation_year = int(created_at.split(' ')[-1])
    return (current_year - creation_year) / (current_year - twitter_start_date)


bot_features['account_age'] = [_account_age(stamp) for stamp in pf_bots['created_at']]
nonbot_features['account_age'] = [_account_age(stamp) for stamp in pf_nonbots['created_at']]

bot_features.head()

Unnamed: 0,is_bot,has_bot_string,account_age
0,1,1,0.181818
1,1,0,0.0
2,1,0,0.272727
3,1,1,0.181818
4,1,1,0.181818


In [8]:
# Followers: followers_count divided by a fixed cap and clipped at 1.
max_followers = 100000
for accounts, features in ((pf_bots, bot_features), (pf_nonbots, nonbot_features)):
    features['followers'] = [
        min(count / max_followers, 1) for count in accounts['followers_count']
    ]

bot_features.head()

Unnamed: 0,is_bot,has_bot_string,account_age,followers
0,1,1,0.181818,0.01136
1,1,0,0.0,0.0
2,1,0,0.272727,3e-05
3,1,1,0.181818,0.00546
4,1,1,0.181818,0.00016


In [9]:
# Favourites: favourites_count scaled by the largest count seen in either class.
max_favourites = max(pf_bots['favourites_count'].max(), pf_nonbots['favourites_count'].max())

# Divide numpy arrays rather than Python lists: `list / scalar` only works when
# the scalar happens to be a numpy type. Fall back to a scale of 1 when every
# count is 0 so the feature is 0.0 instead of NaN from 0/0.
favourites_scale = max_favourites if max_favourites > 0 else 1
bot_features['favourites'] = pf_bots['favourites_count'].values / favourites_scale
nonbot_features['favourites'] = pf_nonbots['favourites_count'].values / favourites_scale

bot_features.head()

Unnamed: 0,is_bot,has_bot_string,account_age,followers,favourites
0,1,1,0.181818,0.01136,0.0
1,1,0,0.0,0.0,0.0
2,1,0,0.272727,3e-05,0.0
3,1,1,0.181818,0.00546,0.0
4,1,1,0.181818,0.00016,0.0


In [10]:
# contributors_enabled flag converted to 0/1.
for accounts, features in ((pf_bots, bot_features), (pf_nonbots, nonbot_features)):
    features['contributors'] = list(map(int, accounts['contributors_enabled']))

bot_features.head()

Unnamed: 0,is_bot,has_bot_string,account_age,followers,favourites,contributors
0,1,1,0.181818,0.01136,0.0,0
1,1,0,0.0,0.0,0.0,0
2,1,0,0.272727,3e-05,0.0,0
3,1,1,0.181818,0.00546,0.0,0
4,1,1,0.181818,0.00016,0.0,0


In [11]:
# Friends: friends_count divided by a fixed cap and clipped at 1.
max_friends = 100000
for accounts, features in ((pf_bots, bot_features), (pf_nonbots, nonbot_features)):
    features['friends'] = [
        min(count / max_friends, 1) for count in accounts['friends_count']
    ]

bot_features.head()

Unnamed: 0,is_bot,has_bot_string,account_age,followers,favourites,contributors,friends
0,1,1,0.181818,0.01136,0.0,0,7e-05
1,1,0,0.0,0.0,0.0,0,0.00022
2,1,0,0.272727,3e-05,0.0,0,0.0
3,1,1,0.181818,0.00546,0.0,0,0.00013
4,1,1,0.181818,0.00016,0.0,0,0.0


In [12]:
# geo_enabled flag converted to 0/1.
for accounts, features in ((pf_bots, bot_features), (pf_nonbots, nonbot_features)):
    features['geo'] = list(map(int, accounts['geo_enabled']))

bot_features.head()

Unnamed: 0,is_bot,has_bot_string,account_age,followers,favourites,contributors,friends,geo
0,1,1,0.181818,0.01136,0.0,0,7e-05,0
1,1,0,0.0,0.0,0.0,0,0.00022,0
2,1,0,0.272727,3e-05,0.0,0,0.0,0
3,1,1,0.181818,0.00546,0.0,0,0.00013,0
4,1,1,0.181818,0.00016,0.0,0,0.0,0


In [13]:
# has_extended_profile flag converted to 0/1.
for accounts, features in ((pf_bots, bot_features), (pf_nonbots, nonbot_features)):
    features['has_extended_profile'] = list(map(int, accounts['has_extended_profile']))

bot_features.head()

Unnamed: 0,is_bot,has_bot_string,account_age,followers,favourites,contributors,friends,geo,has_extended_profile
0,1,1,0.181818,0.01136,0.0,0,7e-05,0,0
1,1,0,0.0,0.0,0.0,0,0.00022,0,0
2,1,0,0.272727,3e-05,0.0,0,0.0,0,0
3,1,1,0.181818,0.00546,0.0,0,0.00013,0,0
4,1,1,0.181818,0.00016,0.0,0,0.0,0,0


In [14]:
# is_translation_enabled flag converted to 0/1.
for accounts, features in ((pf_bots, bot_features), (pf_nonbots, nonbot_features)):
    features['is_translation_enabled'] = list(map(int, accounts['is_translation_enabled']))

bot_features.head()

Unnamed: 0,is_bot,has_bot_string,account_age,followers,favourites,contributors,friends,geo,has_extended_profile,is_translation_enabled
0,1,1,0.181818,0.01136,0.0,0,7e-05,0,0,0
1,1,0,0.0,0.0,0.0,0,0.00022,0,0,0
2,1,0,0.272727,3e-05,0.0,0,0.0,0,0,0
3,1,1,0.181818,0.00546,0.0,0,0.00013,0,0,1
4,1,1,0.181818,0.00016,0.0,0,0.0,0,0,0


In [15]:
# Language indicator: 1 when the account's lang is exactly 'en', otherwise 0.
for accounts, features in ((pf_bots, bot_features), (pf_nonbots, nonbot_features)):
    features['lang'] = [1 if code == 'en' else 0 for code in accounts['lang']]

bot_features.head()

Unnamed: 0,is_bot,has_bot_string,account_age,followers,favourites,contributors,friends,geo,has_extended_profile,is_translation_enabled,lang
0,1,1,0.181818,0.01136,0.0,0,7e-05,0,0,0,1
1,1,0,0.0,0.0,0.0,0,0.00022,0,0,0,1
2,1,0,0.272727,3e-05,0.0,0,0.0,0,0,0,1
3,1,1,0.181818,0.00546,0.0,0,0.00013,0,0,1,1
4,1,1,0.181818,0.00016,0.0,0,0.0,0,0,0,0


In [16]:
# Location indicator: 0 when location is the empty string, otherwise 1.
for accounts, features in ((pf_bots, bot_features), (pf_nonbots, nonbot_features)):
    features['location'] = [0 if place == '' else 1 for place in accounts['location']]

bot_features.head()

Unnamed: 0,is_bot,has_bot_string,account_age,followers,favourites,contributors,friends,geo,has_extended_profile,is_translation_enabled,lang,location
0,1,1,0.181818,0.01136,0.0,0,7e-05,0,0,0,1,0
1,1,0,0.0,0.0,0.0,0,0.00022,0,0,0,1,0
2,1,0,0.272727,3e-05,0.0,0,0.0,0,0,0,1,0
3,1,1,0.181818,0.00546,0.0,0,0.00013,0,0,1,1,1
4,1,1,0.181818,0.00016,0.0,0,0.0,0,0,0,0,0


In [17]:
# notifications flag converted to 0/1.
for accounts, features in ((pf_bots, bot_features), (pf_nonbots, nonbot_features)):
    features['notifications'] = list(map(int, accounts['notifications']))

bot_features.head()

Unnamed: 0,is_bot,has_bot_string,account_age,followers,favourites,contributors,friends,geo,has_extended_profile,is_translation_enabled,lang,location,notifications
0,1,1,0.181818,0.01136,0.0,0,7e-05,0,0,0,1,0,0
1,1,0,0.0,0.0,0.0,0,0.00022,0,0,0,1,0,0
2,1,0,0.272727,3e-05,0.0,0,0.0,0,0,0,1,0,0
3,1,1,0.181818,0.00546,0.0,0,0.00013,0,0,1,1,1,0
4,1,1,0.181818,0.00016,0.0,0,0.0,0,0,0,0,0,0


In [18]:
# Number of tweets: statuses_count divided by a fixed cap and clipped at 1.
max_tweets = 100000
for accounts, features in ((pf_bots, bot_features), (pf_nonbots, nonbot_features)):
    features['tweets'] = [
        min(count / max_tweets, 1) for count in accounts['statuses_count']
    ]

bot_features.head()


Unnamed: 0,is_bot,has_bot_string,account_age,followers,favourites,contributors,friends,geo,has_extended_profile,is_translation_enabled,lang,location,notifications,tweets
0,1,1,0.181818,0.01136,0.0,0,7e-05,0,0,0,1,0,0,0.24611
1,1,0,0.0,0.0,0.0,0,0.00022,0,0,0,1,0,0,1e-05
2,1,0,0.272727,3e-05,0.0,0,0.0,0,0,0,1,0,0,0.0105
3,1,1,0.181818,0.00546,0.0,0,0.00013,0,0,1,1,1,0,0.05293
4,1,1,0.181818,0.00016,0.0,0,0.0,0,0,0,0,0,0,0.32468


In [19]:
# verified flag converted to 0/1.
for accounts, features in ((pf_bots, bot_features), (pf_nonbots, nonbot_features)):
    features['verified'] = list(map(int, accounts['verified']))

bot_features.head()

Unnamed: 0,is_bot,has_bot_string,account_age,followers,favourites,contributors,friends,geo,has_extended_profile,is_translation_enabled,lang,location,notifications,tweets,verified
0,1,1,0.181818,0.01136,0.0,0,7e-05,0,0,0,1,0,0,0.24611,0
1,1,0,0.0,0.0,0.0,0,0.00022,0,0,0,1,0,0,1e-05,0
2,1,0,0.272727,3e-05,0.0,0,0.0,0,0,0,1,0,0,0.0105,0
3,1,1,0.181818,0.00546,0.0,0,0.00013,0,0,1,1,1,0,0.05293,0
4,1,1,0.181818,0.00016,0.0,0,0.0,0,0,0,0,0,0,0.32468,0


In [20]:
# Has URL: 1 when the account's url field is set, 0 when it is missing.
# pd.notna treats both None and NaN as missing; NaN appears when a record
# lacks the key entirely, and `item != None` would wrongly count it as a URL.
bot_features['url'] = [int(pd.notna(item)) for item in pf_bots['url']]
nonbot_features['url'] = [int(pd.notna(item)) for item in pf_nonbots['url']]

bot_features.head()

Unnamed: 0,is_bot,has_bot_string,account_age,followers,favourites,contributors,friends,geo,has_extended_profile,is_translation_enabled,lang,location,notifications,tweets,verified,url
0,1,1,0.181818,0.01136,0.0,0,7e-05,0,0,0,1,0,0,0.24611,0,0
1,1,0,0.0,0.0,0.0,0,0.00022,0,0,0,1,0,0,1e-05,0,0
2,1,0,0.272727,3e-05,0.0,0,0.0,0,0,0,1,0,0,0.0105,0,0
3,1,1,0.181818,0.00546,0.0,0,0.00013,0,0,1,1,1,0,0.05293,0,1
4,1,1,0.181818,0.00016,0.0,0,0.0,0,0,0,0,0,0,0.32468,0,0


In [21]:
# default_profile flag converted to 0/1.
for accounts, features in ((pf_bots, bot_features), (pf_nonbots, nonbot_features)):
    features['default_profile'] = list(map(int, accounts['default_profile']))

bot_features.head()

Unnamed: 0,is_bot,has_bot_string,account_age,followers,favourites,contributors,friends,geo,has_extended_profile,is_translation_enabled,lang,location,notifications,tweets,verified,url,default_profile
0,1,1,0.181818,0.01136,0.0,0,7e-05,0,0,0,1,0,0,0.24611,0,0,0
1,1,0,0.0,0.0,0.0,0,0.00022,0,0,0,1,0,0,1e-05,0,0,1
2,1,0,0.272727,3e-05,0.0,0,0.0,0,0,0,1,0,0,0.0105,0,0,1
3,1,1,0.181818,0.00546,0.0,0,0.00013,0,0,1,1,1,0,0.05293,0,1,0
4,1,1,0.181818,0.00016,0.0,0,0.0,0,0,0,0,0,0,0.32468,0,0,1


In [22]:
# default_profile_image flag converted to 0/1.
for accounts, features in ((pf_bots, bot_features), (pf_nonbots, nonbot_features)):
    features['default_profile_image'] = list(map(int, accounts['default_profile_image']))

bot_features.head()

Unnamed: 0,is_bot,has_bot_string,account_age,followers,favourites,contributors,friends,geo,has_extended_profile,is_translation_enabled,lang,location,notifications,tweets,verified,url,default_profile,default_profile_image
0,1,1,0.181818,0.01136,0.0,0,7e-05,0,0,0,1,0,0,0.24611,0,0,0,0
1,1,0,0.0,0.0,0.0,0,0.00022,0,0,0,1,0,0,1e-05,0,0,1,1
2,1,0,0.272727,3e-05,0.0,0,0.0,0,0,0,1,0,0,0.0105,0,0,1,0
3,1,1,0.181818,0.00546,0.0,0,0.00013,0,0,1,1,1,0,0.05293,0,1,0,0
4,1,1,0.181818,0.00016,0.0,0,0.0,0,0,0,0,0,0,0.32468,0,0,1,0


In [23]:
# Listed: listed_count divided by a fixed cap and clipped at 1.
max_listed = 10000
for accounts, features in ((pf_bots, bot_features), (pf_nonbots, nonbot_features)):
    features['listed'] = [
        min(count / max_listed, 1) for count in accounts['listed_count']
    ]

bot_features.head()

Unnamed: 0,is_bot,has_bot_string,account_age,followers,favourites,contributors,friends,geo,has_extended_profile,is_translation_enabled,lang,location,notifications,tweets,verified,url,default_profile,default_profile_image,listed
0,1,1,0.181818,0.01136,0.0,0,7e-05,0,0,0,1,0,0,0.24611,0,0,0,0,0.0002
1,1,0,0.0,0.0,0.0,0,0.00022,0,0,0,1,0,0,1e-05,0,0,1,1,0.0
2,1,0,0.272727,3e-05,0.0,0,0.0,0,0,0,1,0,0,0.0105,0,0,1,0,0.0003
3,1,1,0.181818,0.00546,0.0,0,0.00013,0,0,1,1,1,0,0.05293,0,1,0,0,0.0051
4,1,1,0.181818,0.00016,0.0,0,0.0,0,0,0,0,0,0,0.32468,0,0,1,0,0.0011


In [26]:
# Persist both feature frames as pickles next to the input data.
for frame, pickle_path in (
    (bot_features, 'processed_dataset/bot_features.pkl'),
    (nonbot_features, 'processed_dataset/nonbot_features.pkl'),
):
    frame.to_pickle(pickle_path)

In [28]:
# Convert the feature frames to plain numpy arrays (one row per account,
# one column per feature, label column included).
# DataFrame.as_matrix() was deprecated in pandas 0.23 and removed in 1.0;
# .values returns the same array and is available on old and new versions.
numpy_bot = bot_features.values
numpy_nonbot = nonbot_features.values

In [34]:
# Write the feature matrices in numpy's binary format; np.save appends the
# '.npy' extension because the given names do not already end with it.
for npy_path, matrix in (
    ('processed_dataset/bot_features', numpy_bot),
    ('processed_dataset/nonbot_features', numpy_nonbot),
):
    np.save(npy_path, matrix)