## Implementing Chris Doenlen's 'Bot Or Not' Python Module

Everything from `twitter_funcs.py` was cloned from Chris' [repository](https://github.com/scrapfishies/twitter-bot-detection).

I will use this to label each user as 'bot' (boolean 1/0).

In [1]:
import pandas as pd
import numpy as np
from copy import deepcopy
from twitter_funcs import *

# Imports
import os
import numpy as np
import pandas as pd

import pickle
import tweepy

from datetime import datetime
from secrets import api_secret_key, api_key, bearer_token
import re
import time
import csv

In [2]:
data = pd.read_pickle("pickle/n2_tokenized_eff.pick")

In [3]:
data.head(3)

Unnamed: 0,id,date,time,user_id,username,hashtags,trump,biden,original,tweet,num_tokens
181142,1323379284434669568,2020-11-02,21:39:43,2820503362,artistacriseida,[],False,True,All these articles showing that Biden is in th...,article showing joebiden lead ignore govote ma...,17
0,1323414585995526144,2020-11-02,23:59:59,1312487180258820096,annapieters17,[],False,True,@FoxNews Lady Gaga’s a nobody. Can’t figure ou...,foxnews lady nobody figure life nobody help jo...,11
4,1323414585232293888,2020-11-02,23:59:59,2335763630,kylechwatt,[],False,True,@The_Grupp “It is purely a fortuity that this ...,purely fortuity mass casualty event history jo...,9


I'm getting a rate limit error. according to twitter site, I can lookup 300 users per 15 minutes. Let's try that.

In [4]:
def chunks(user_ids, n):
    """Yield successive n-sized chunks from user_ids (iterable)."""
    lst = list(user_ids)
    for i in range(0, len(lst), n):
        yield lst[i:i + n]
    


Here's where we'll implement Chris Doenlen's 'Bot or Not' model.

In [5]:
with open("bot_model.pick", "rb") as read_file:
    xgb_model = pickle.load(read_file)

In [6]:
time.sleep(60*15)

In [None]:
verifieds = {}
bot_probas = {}

all_users = list(data.user_id.unique())
print(f"Total number of users to scrape: {len(all_users)}")
exist = pd.read_csv("../data/user_stats.csv")
exist_users = list(exist.user_id.unique())
print(f"Number of users already scraped: {len(exist_users)}")

user_ids = []
for user in exist_users:
    if user in all_users:
        continue
    else:
        user_ids.append(user)
    
print(f"Preparing to identify bots for {len(user_ids)} users...")

user_id_chunks = list(chunks(user_ids, n=300))

# now get stats for new users

csv_file = open("../data/user_stats.csv", "a")
csv_writer = csv.writer(csv_file)
csv_writer.writerow(["user_id", "bot_proba", "verified"])
for chunk in user_id_chunks:
    print(f"Preparing chunk. Num users: {len(chunk)}")
    for user_id in chunk:
        print(f"Preparing user '{user_id}'")
        
        
        auth = tweepy.OAuthHandler(api_key, api_secret_key)
        api = tweepy.API(auth)
        
        try: # Gather features for bot/not bot model
            # Get user information from screen name
            user = api.get_user(user_id)

            # account features to return for predicton
            account_age_days = (datetime.now() - user.created_at).days
            verified = user.verified # will also use this in our data
            geo_enabled = user.geo_enabled
            default_profile = user.default_profile
            default_profile_image = user.default_profile_image
            favourites_count = user.favourites_count
            followers_count = user.followers_count
            friends_count = user.friends_count
            statuses_count = user.statuses_count
            average_tweets_per_day = np.round(statuses_count / account_age_days, 3)

            # manufactured features
            hour_created = int(user.created_at.strftime("%H"))
            network = np.round(np.log(1 + friends_count) * np.log(1 + followers_count), 3)
            tweet_to_followers = np.round(
                np.log(1 + statuses_count) * np.log(1 + followers_count), 3
            )
            follower_acq_rate = np.round(
                np.log(1 + (followers_count / account_age_days)), 3
            )
            friends_acq_rate = np.round(np.log(1 + (friends_count / account_age_days)), 3)

            # organizing list to be returned
            account_features = [
                verified, hour_created,geo_enabled,default_profile,default_profile_image,favourites_count,
                followers_count,friends_count,statuses_count,average_tweets_per_day,network,tweet_to_followers,
                follower_acq_rate,friends_acq_rate]

            if account_features == np.nan:
                proba = np.nan
                verified = np.nan
                csv_writer.writerow([user_id, proba, verified])
                continue

            else:
                user_m = np.matrix(account_features)
                proba = np.round(xgb_model.predict_proba(user_m)[:, 1][0] * 100, 2)
                verified = account_features[0]
                csv_writer.writerow([user_id, proba, verified])

        except:
            print(f'error encountered, skipping user {user_id}')
            proba = np.nan
            verified = np.nan
        
            csv_writer.writerow([user_id, proba, verified])
    print("Chunk complete. Waiting 15 minutes.")
    time.sleep(15*60+1)

csv_file.close()

Total number of users to scrape: 69839
Number of users already scraped: 20101
Preparing to identify bots for 6072 users...
Preparing chunk. Num users: 300
Preparing user '105937456'
Preparing user '46538055'
Preparing user '1288981025364860928'
Preparing user '65067002'
Preparing user '1066144442241953792'
Preparing user '1007960516'
Preparing user '1118489465008336897'
Preparing user '1243870145229008896'
Preparing user '989298900'
Preparing user '807989050972000256'
Preparing user '838183322'
Preparing user '1282581117632000000'
Preparing user '3071943960'
Preparing user '143975787'
Preparing user '2952274270'
Preparing user '252260932'
Preparing user '60591221'
Preparing user '1109578158934814726'
Preparing user '1268904953810235393'
Preparing user '256213677'
Preparing user '520923095'
Preparing user '398956523'
Preparing user '1265074835862028291'
Preparing user '3253566278'
Preparing user '375695666'
Preparing user '4520673922'
Preparing user '1319272482201763840'
Preparing user 

Preparing user '2208908697'
Preparing user '92007846'
Preparing user '569203466'
Preparing user '2842801283'
Preparing user '223647062'
Preparing user '934210591186661378'
Preparing user '219115571'
Preparing user '1079624037687590912'
Preparing user '863163460871499778'
Preparing user '487394203'
Preparing user '1234381916557791232'
Preparing user '1156256884644597761'
Preparing user '703646469643964420'
Preparing user '1057379539721076737'
Preparing user '1207696515336155143'
Preparing user '1321047314358804482'
Preparing user '593991500'
Preparing user '737458865818767360'
Preparing user '1286335808601878528'
Preparing user '1378143456'
Preparing user '212056045'
Preparing user '895950564'
Preparing user '1253622312634638337'
Preparing user '83445531'
error encountered, skipping user 83445531
Preparing user '995163837153202176'
Preparing user '400421942'
Preparing user '15527119'
Preparing user '1100545410018488321'
Preparing user '877129747'
Preparing user '1318192107681648640'
Pre

Preparing user '1274076657251278848'
Preparing user '832441396229312512'
Preparing user '2287103726'
Preparing user '865355752558481408'
Preparing user '1186282148401205248'
Preparing user '19408150'
Preparing user '214561638'
Preparing user '307168354'
Preparing user '1098955571636719616'
Preparing user '1274745968261562368'
Preparing user '1168319818648494080'
Preparing user '1201217504414240769'
Preparing user '849260532058271745'
Preparing user '1199409943251759106'
Preparing user '3418844352'
Preparing user '998980984707035136'
Preparing user '825483082618675200'
Preparing user '1288243347921010688'
Preparing user '787814937091604480'
Preparing user '1077026177372573696'
Preparing user '178494253'
Preparing user '316617810'
Preparing user '798643914832039936'
Preparing user '2348817804'
Preparing user '349506763'
Preparing user '1304192928806506496'
Preparing user '703420677941026817'
Preparing user '1241257069703041024'
Preparing user '1225949241073377280'
Preparing user '9403125

Preparing user '1221997897522335744'
Preparing user '27737365'
Preparing user '1184964035126546432'
Preparing user '1210310224734806023'
Preparing user '1255305312250277890'
Preparing user '1206589313351327746'
Preparing user '1242945424186638340'
Preparing user '1319517462203777024'
Preparing user '522384942'
Preparing user '2287226102'
Preparing user '862397514833592321'
Preparing user '47476339'
Preparing user '1323315923248320512'
Preparing user '1209469566662926336'
Preparing user '28633302'
Preparing user '403023454'
Preparing user '804098150650646530'
Preparing user '1298491472057171970'
Preparing user '3000458999'
Preparing user '987141826166579206'
Preparing user '1155660524631265281'
Preparing user '1099685754266095616'
Preparing user '1013449281213272064'
Preparing user '743979282'
Preparing user '1439618545'
Preparing user '1193950489186050048'
Preparing user '1308946861307449345'
Preparing user '782081846859280384'
Preparing user '1299448727917096960'
Preparing user '36116

Preparing user '610980533'
Preparing user '1077039234580545536'
Preparing user '1284961480866242566'
Preparing user '96224314'
Preparing user '20341782'
Preparing user '426024397'
Preparing user '4384107383'
Preparing user '58702820'
Preparing user '1098239435441811458'
Preparing user '1422670596'
Preparing user '1039524968659394560'
Preparing user '34750263'
Preparing user '2780758138'
Preparing user '2402357353'
Preparing user '1677253597'
Preparing user '701408020'
Preparing user '1144093434795483137'
Preparing user '51537642'
Preparing user '1303649795756699650'
Preparing user '1239739439053602818'
Preparing user '103664819'
Preparing user '1294626250833764355'
Preparing user '900903689669496832'
Preparing user '600936439'
Preparing user '258244886'
Preparing user '38366272'
Preparing user '1196200557800755200'
Preparing user '4745981483'
Preparing user '818077748627435521'
Preparing user '627571097'
Preparing user '1233494934939979778'
Preparing user '757668385'
Preparing user '23

In [None]:
bot_probas

In [None]:
data['bot_proba'] = data['user_id'].map(bot_probas)
data['verifieds'] = data['user_id'].map(is_verified)
data.to_pickle("pickle/tw_proba_verif.pick")

In [None]:
verifieds

In [None]:
data['verified'] = data['user_id'].map(verifieds)
data['bot_proba'] = data['user_id'].map(bot_probas)
