## Implementing Chris Doenlen's 'Bot Or Not' Python Module

Everything from `twitter_funcs.py` was cloned from Chris' [repository](https://github.com/scrapfishies/twitter-bot-detection).

I will use this to label each user as 'bot' (boolean 1/0).

In [1]:
import pandas as pd
import numpy as np
from copy import deepcopy
from twitter_funcs import *

# Imports
import os
import numpy as np
import pandas as pd

import pickle
import tweepy

from datetime import datetime
from secrets import api_secret_key, api_key, bearer_token
import re
import time
import csv

In [2]:
data = pd.read_pickle("pickle/n2_tokenized.pick")

In [3]:
data.head(3)

Unnamed: 0,trump,biden,hashtags,user_id,original,tweet
181142,False,True,[],2820503362,All these articles showing that Biden is in th...,showing joe_biden lead ignore still vote showi...
0,False,True,[],1312487180258820096,@FoxNews Lady Gaga’s a nobody. Can’t figure ou...,lady nobody figure life even see nobody help j...
4,False,True,[],2335763630,@The_Grupp “It is purely a fortuity that this ...,purely fortuity great mass casualty history jo...


I'm getting a rate limit error. according to twitter site, I can lookup 300 users per 15 minutes. Let's try that.

In [4]:
def chunks(user_ids, n):
    """Yield successive n-sized chunks from user_ids (iterable)."""
    lst = list(user_ids)
    for i in range(0, len(lst), n):
        yield lst[i:i + n]
    


Here's where we'll implement Chris Doenlen's 'Bot or Not' model.

In [5]:
with open("bot_model.pick", "rb") as read_file:
    xgb_model = pickle.load(read_file)

In [6]:
# time.sleep(60*15)

In [None]:
verifieds = {}
bot_probas = {}

all_users = list(data.user_id.unique())
print(f"Total number of users to scrape: {len(all_users)}")
exist = pd.read_csv("../data/user_stats.csv")
exist_users = list(exist.user_id.unique())
print(f"Number of users already scraped: {len(exist_users)}")

user_ids = []
for user in exist_users:
    if user in all_users:
        continue
    user_ids.append(user)
    
print(f"Preparing to identify bots for {len(user_ids)} users...")

user_id_chunks = list(chunks(user_ids, n=300))

# now get stats for new users

csv_file = open("../data/user_stats.csv", "a")
csv_writer = csv.writer(csv_file)
csv_writer.writerow(["user_id", "bot_proba", "verified"])
for chunk in user_id_chunks:
    print(f"Preparing chunk. Num users: {len(chunk)}")
    for user_id in chunk:
        print(f"Preparing user '{user_id}'")
        
        
        auth = tweepy.OAuthHandler(api_key, api_secret_key)
        api = tweepy.API(auth)
        
        try: # Gather features for bot/not bot model
            # Get user information from screen name
            user = api.get_user(user_id)

            # account features to return for predicton
            account_age_days = (datetime.now() - user.created_at).days
            verified = user.verified # will also use this in our data
            geo_enabled = user.geo_enabled
            default_profile = user.default_profile
            default_profile_image = user.default_profile_image
            favourites_count = user.favourites_count
            followers_count = user.followers_count
            friends_count = user.friends_count
            statuses_count = user.statuses_count
            average_tweets_per_day = np.round(statuses_count / account_age_days, 3)

            # manufactured features
            hour_created = int(user.created_at.strftime("%H"))
            network = np.round(np.log(1 + friends_count) * np.log(1 + followers_count), 3)
            tweet_to_followers = np.round(
                np.log(1 + statuses_count) * np.log(1 + followers_count), 3
            )
            follower_acq_rate = np.round(
                np.log(1 + (followers_count / account_age_days)), 3
            )
            friends_acq_rate = np.round(np.log(1 + (friends_count / account_age_days)), 3)

            # organizing list to be returned
            account_features = [
                verified, hour_created,geo_enabled,default_profile,default_profile_image,favourites_count,
                followers_count,friends_count,statuses_count,average_tweets_per_day,network,tweet_to_followers,
                follower_acq_rate,friends_acq_rate]

            if account_features == np.nan:
                proba = np.nan
                verified = np.nan
                csv_writer.writerow([user_id, proba, verified])
                continue

            else:
                user_m = np.matrix(account_features)
                proba = np.round(xgb_model.predict_proba(user_m)[:, 1][0] * 100, 2)
                verified = account_features[0]
                csv_writer.writerow([user_id, proba, verified])

        except:
            print(f'error encountered, skipping user {user_id}')
            proba = np.nan
            verified = np.nan
        
            csv_writer.writerow([user_id, proba, verified])
    print("Chunk complete. Waiting 15 minutes.")
    time.sleep(15*60+1)

csv_file.close()

Total number of users to scrape: 34383
Number of users already scraped: 20101
Preparing to identify bots for 13541 users...
Preparing chunk. Num users: 300
Preparing user '105937456'
Preparing user '46538055'
Preparing user '100625142'
Preparing user '1288981025364860928'
Preparing user '65067002'
Preparing user '2562614485'
Preparing user '2246207654'
Preparing user '1066144442241953792'
Preparing user '1007960516'
Preparing user '1118489465008336897'
Preparing user '1206954619789856769'
Preparing user '1243870145229008896'
Preparing user '1658455681'
Preparing user '989298900'
Preparing user '1267800950321876994'
Preparing user '807989050972000256'
Preparing user '838183322'
Preparing user '893684694759362561'
Preparing user '1282581117632000000'
Preparing user '709149862227484672'
Preparing user '3071943960'
Preparing user '761111078201810944'
Preparing user '757727571807051776'
Preparing user '1194715921908023296'
Preparing user '143975787'
Preparing user '2952274270'
Preparing use

Preparing user '1133485857841848320'
Preparing user '1171412312999288832'
Preparing user '2787215427'
Preparing user '500289229'
Preparing user '2213540755'
Preparing user '1840457510'
Preparing user '824171096'
Preparing user '821029492030377984'
Preparing user '3158618551'
Preparing user '898218643888197632'
Preparing user '97413558'
Preparing user '59810886'
Preparing user '2986724116'
Preparing user '1201176493'
Preparing user '812919639894331392'
Preparing user '1229203145676320769'
Preparing user '74333189'
Preparing user '16843499'
Preparing user '2380623400'
Preparing user '1257544806185799680'
Preparing user '1253818448410316802'
Preparing user '1290427170872066053'
Preparing user '1201662807860756480'
Preparing user '758569471351070720'
Preparing user '277524205'
Preparing user '1247962815035510784'
Preparing user '44748304'
Preparing user '3084606262'
Preparing user '1274357130427056129'
Preparing user '16458709'
Preparing user '195645207'
Preparing user '615915855'
Preparin

In [None]:
bot_probas

In [None]:
data['bot_proba'] = data['user_id'].map(bot_probas)
data['verifieds'] = data['user_id'].map(is_verified)
data.to_pickle("pickle/tw_proba_verif.pick")

In [None]:
verifieds

In [None]:
data['verified'] = data['user_id'].map(verifieds)
data['bot_proba'] = data['user_id'].map(bot_probas)
