# 0. Initialize

## 0.1. Import Libraries

In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
import os, sys, glob
import gzip
import random
import tqdm
import json
import numpy as np
import pandas as pd
pd.set_option("display.max_columns", None)

from IPython import display
import matplotlib as mpl
from matplotlib import pyplot as plt

## 0.2. DEFINE VARIABLES 

In [2]:
DATA_PATH = '/Users/Gamegaraj/Desktop/CS412/data/' # '<insert-your-training-data-path-here>'

ROUND = 2 # This project will have 3 rounds of predictions: 1,2,3
STUDENT_ID = '26772'#'<insert-your-id-here>'
PROJECT_CODE = 'CS4129a709ea5dfc4'#'<insert-your-code-here>' # Same code for the annotation eg. CS412xxxxx

## 0.3. Read Training & Evaluation Data

### 0.3.1. Get the labels for tweets

In [3]:
#trainingTweetDf = pd.read_csv('{}training-tweet.csv'.format(DATA_PATH))
trainingTweetDf = pd.read_csv('{}training-tweet.csv'.format(DATA_PATH), dtype={'tweet_id': str, 'isPolitical': str})
trainingTweetDf

Unnamed: 0,tweet_id,isPolitical
0,1597170281545551872,Yes
1,1431700027471192069,No
2,1566035577090281472,Yes
3,1591538690869940225,Yes
4,1583898169238167554,Yes
...,...,...
2995,1593539327623151619,Yes
2996,1393886554062524418,No
2997,1597925615092764672,Yes
2998,1585291418616176640,Yes


In [4]:
trainingTweetDf.isPolitical.value_counts()

Yes    2003
No      997
Name: isPolitical, dtype: int64

### 0.3.2. Get the labels for users

In [5]:
trainingUserDf = pd.read_csv('{}training-user.csv'.format(DATA_PATH))
#trainingUserDf = pd.read_csv('training-user.csv')
trainingUserDf

Unnamed: 0,screen_name,isBot
0,koftecancaddy,No
1,ahaber,No
2,selahat03949652,No
3,erdin06357062,No
4,bhct__necatii,No
...,...,...
2995,djblumenberg,No
2996,mel1sq,No
2997,eren_yz1,Yes
2998,ergnyildiz4,No


In [6]:
trainingUserDf.isBot.value_counts()

No     2424
Yes     576
Name: isBot, dtype: int64

### 0.3.3. Expand your dataset with metadata and tweets

In [7]:
# You can also expand training data by downloading your own labeled datasets following the link
# Download the documents under "Link to training data"

print('http://www.onurvarol.com/Annotation-CS412-202201/reports/report_{}.html'.format(PROJECT_CODE))

http://www.onurvarol.com/Annotation-CS412-202201/reports/report_CS4129a709ea5dfc4.html


In [8]:
### Add Annotation Data
codes = ['CS4129a709ea5dfc4','CS412a32e72d94b5f','CS412c255f188f1f1','CS4125691a2d1c16d','CS412aa4c69f55b37']

def mergeAnnotations(codes):
    df_users = pd.DataFrame()
    for i in codes:
        filename = 'annotated_users_{}.csv'.format(i)
        filename = ('{}'+filename)
        df = pd.read_csv(filename.format(DATA_PATH))
        df = df.drop(['url', 'isOrganizational', 'isTroll','gender'], axis=1)
        df = df.drop(df[df.isBot == 'Not sure'].index)
        df.rename(columns = {'Unnamed: 0':'screen_name'}, inplace = True)
        df.dropna(inplace=True)
        df_users = pd.concat([df,df_users])
    

    df_tweets = pd.DataFrame()
    for i in codes:
        filename = 'annotated_tweets_{}.csv'.format(i)
        filename = ('{}'+filename)
        df = pd.read_csv(filename.format(DATA_PATH))
        df = df.drop(['url', 'sentiment', 'isExperiential','isInsult','topics'], axis=1)
        df.rename(columns = {'Unnamed: 0':'tweet_id'}, inplace = True)
        df.dropna(inplace=True)
        df_tweets = pd.concat([df,df_tweets])  
        df_tweets['tweet_id'] = df_tweets['tweet_id'].astype(str)
    
    return df_users, df_tweets

In [9]:
df_users, df_tweets = mergeAnnotations(codes)

trainingUserDf = pd.concat([trainingUserDf,df_users],ignore_index=True)
trainingTweetDf = pd.concat([trainingTweetDf,df_tweets],ignore_index=True)

# 1. EXTRACT FEATURES
Under *1.1. Political Tweet Detection* and *1.2. Bot Detection*, we firstly collect raw data for processing. We then combine some of them (total_interactions = num_favorites + num_retweets) or use them to extract features (whether the tweet has one of the political entities @meralaksener, @kilicdarogluk etc.).

We expect you to collect more raw data from **tweet_metadata**, **user_profiles** and **user_tweets** files by creating a function as shown in below examples such as *check_if_retweet()* and using it while iterating over data as shown under *Merge Collected Features*.

We also expect you to create new variables as much as you can from the data in order to make your predictions more accurate. For example, you may want to check:

- The tweet sources that a user frequently uses
- Whether the user is a verified account or not

...

to assess whether **a user is a bot or not** and whether **a tweet is political or not**.

In [10]:
PATH_TO_DOWNLOADED = DATA_PATH # 'D:/Users/suuser/Desktop/Sabancı/CS412/spring-2022/project/'

## 1.1. Political Tweet Detection
This part stands for the feature extraction of tweets. We start with collecting the raw data from *tweet_metadata*, then use some of them to extract features.

### 1.1.1. Get Raw Data

#### 1.1.1.1. Check if Retweet

In [11]:
def check_if_retweet(tweet_metadata_line):
    is_retweet = 0
    retweeted_username = ''

    try:
        tweet_metadata_line['retweeted_status']
        retweeted_username = tweet_metadata_line['retweeted_status']['user']['screen_name']
        is_retweet = 1

    except KeyError:
        pass

    return is_retweet, retweeted_username

#### 1.1.1.1. Check if Reply

In [12]:
def check_if_reply(tweet_metadata_line):
    is_reply = 0
    replied_username = ''

    
    if(tweet_metadata_line['in_reply_to_screen_name'] != None ):
        replied_username = tweet_metadata_line['in_reply_to_screen_name']
        is_reply = 1



    return is_reply, replied_username

#### 1.1.1.1. Check if Verified

In [13]:
def check_if_verified(tweet_metadata_line):
     
    is_verified = tweet_metadata_line['user']['verified']  
    return is_verified

#### 1.1.1.2. Get Tweet Text

In [14]:
def get_tweet_text(tweet_metadata_line):
    text = tweet_metadata_line['text']
    
    return text

#### 1.1.1.3. Get Tweet ID

In [15]:
def get_tweet_id(tweet_metadata_line):
    id_str = tweet_metadata_line['id_str']
    
    return id_str

#### 1.1.1.4. Get Number of Mentions and Hashtags

In [16]:
def get_number_mentions_hashtags(tweet_metadata_line):
    names = []
    num_mentions = len(tweet_metadata_line['entities']['user_mentions'])
    if(num_mentions!=0):
        mentions = tweet_metadata_line['entities']['user_mentions']
        for mention in mentions:
            names.append(mention['screen_name'])
    num_hashtags = len(tweet_metadata_line['entities']['hashtags'])
    avg = 0
    if(num_hashtags!=0):
        count = 0
        hashtags = tweet_metadata_line['entities']['hashtags']
        for hashtag in hashtags:
             count += len(hashtag['text'])
        avg = count/len(hashtags)
    return num_mentions, names, num_hashtags, avg

#### 1.1.1.5. Get Number of Retweets and Favorites

In [17]:
def get_number_retweets_favorites(tweet_metadata_line):
    retweet_count = tweet_metadata_line['retweet_count']
    favorite_count = tweet_metadata_line['favorite_count']
    
    return retweet_count, favorite_count

#### 1.1.1.5. Get Number of Punctuations

In [18]:
def get_punc_num(text):
    punc = ['.', ',', '!', '?', ':', ';']
    count = 0
    for x in punc:
        if x in text:
            count += 1
    return count

In [19]:
def nonalpha(string):
  count = 0
  # check each character in the string
  for char in string:
     # increment by 1 if it's non-alphanumeric
     if not char.isalpha():
       count += 1           

  # Take whichever is smaller
  return count


#### 1.1.1.6. Get User Info

In [20]:
def get_user_info(tweet_metadata_line):
    id = tweet_metadata_line['user']['id_str']
    screen_name = tweet_metadata_line['user']['screen_name'].lower()
    description = tweet_metadata_line['user']['description']

    return id, screen_name, description

### 1.1.2. Derive Manually Crafted Features

#### 1.1.2.1. Check for political entity in text

In [21]:
def check_political_ent(text):
    
    # the list below can be modified and some new names may be added (or removed)
    # list_of_entities = ['meral_aksener', 'kilicdarogluk', 'vekilince', 'RTErdogan', 'MevlutCavusoglu', 'umitozdag']
    list_of_entities = ['meral_aksener', 'kilicdarogluk', 'vekilince', 'RTErdogan', 'MevlutCavusoglu', 'umitozdag','vedatbilgn','SavciSayan','fahrettinaltun','erdogan','devlet bahçeli','bahçeli','ülkü'
                        'tcbestepe','fuatoktay','suleymansoylu','ikalin1','ekrem_imamoglu','mansuryavas06','pkk','recep','tayyip','erdoğan','cb','#ak','cumhuriyet'
                        'hdp','terörist','chpkk','kadro','ek','tcbestepe','MevlutCavusoglu', 'ozdag', 'özdağ', 'TBMM','drfahrettinkoca', 'yenisafak', 'tayyip', 'cumhur', 'belediye', 'baskan', 'başkan', 'ulusal',
                        'odatv', 'suleyman', 'haskologlu', 'mansur', 'dbdevletbahceli', 'Ahmet_Davutoglu', 'babacan', 'gazetesozcu', 'imamoglu', 'imamoğlu', 'parlament', 'meclis', 
                        'savaş', 'eğitim', 'egitim', 'dolar', 'lira', 'enflasyon', 'euro', 'döviz', 'altın', 'benzin', 'atama', 'altılı masa', 'abd', 'avrupa', 'almanya', 'nato',
                        'sınır', 'göçmen', 'gocmen', 'sığınmacı', 'mülteci', 'mahkeme', 'kanun', 'ukrayna', 'rusya', 'komisyon', 'fetö', 'faiz', 'piyasa', 'banka', 'politik', 'toplantı', 'çiftçi',
                        'saray', 'demokrasi', 'faşist', 'kemal', 'rejim', 'özgürlük', 'koalisyon', 'egemen', 'kurultay', 'danıştay', 'davutoğlu', 'birleşmiş milletler',
                        'bahçeli', 'diplomasi', 'cem uzan', 'lgbt', 'seçim', '2023', 'ibb', 'cemaat', 'soylu', 'liberal', 'kapital', 'protesto', 'halk','seçim', 
                        'erken seçim','sandık','icraat','gençlik kolu','akp','chp','mhp', 'kayyum','anayasa','mahkeme','nebati','tcmb','merkez bankası','kılışdar','cumhurbaşkanı adayı','millet','zillet'
                        'rte','atama','EYT','atanamıyor','kanun','soruşturma','sorusturma','altılı masa','bakanım','af','genel af', 'skandal','yolsuzluk','ihale','zafer','genelaf','muhalefet','muhalif','bakan'
                       'meral_aksener', 'kilicdarogluk', 'vekilince', 'RTErdogan', 'MevlutCavusoglu', 'umitozdag','vedatbilgn','SavciSayan','fahrettinaltun',
                        'tcbestepe','fuatoktay','suleymansoylu','ikalin1','ekrem_imamoglu','mansuryavas06','murat_kurum',
                       'MevlutCavusoglu', 'drfahrettinkoca', 'NureddinNebati','akaraismailoglu','DIBAliErbas','ismailcatakli',
                    'dbdevletbahceli', 'Ahmet_Davutoglu','deryayanikashb','suleymansoylu','06melihgokcek','mustafasentop','VahitKirisci',
                     'fuatoktay','Akparti','emineerdoğan','iletisim','bybekirbozdag','omerrcelik','kasapoglu']
    entities_in_text = [ent for ent in list_of_entities if ent.lower() in text.lower()]
    number_entities = len(entities_in_text)

    return number_entities

In [22]:
def description_entities(text):
    list_of_entities = ['meral_aksener', 'kilicdarogluk', 'vekilince', 'RTErdogan', 'MevlutCavusoglu', 'umitozdag', 'parti', 'CHP', 'AKP', 'HDP', 'MHP', 'mustafa kemal', 'ekonomi', 'dolar', 'halk', 'afgan', 'suriye', 'islam', 'siyasal', 'terör', 'pkk', 'belediye', 'gençlik', 'fahrettin koca', 'bakanım', 'başkanım', 'T.C.', 'reis', 'demirtaş', 'öcalan', 'asker', 'mehmetçik', 'uzun adam', 'meral akşener', 'kemal kılıçdaroğlu', 'TBMM', 'ittifak', 'koalisyon', 'hükümet', 'devlet', 'bakan', 'dolar', 'euro', 'enflasyon', 'vekil', 'aday', 'seçim', 'kongre', 'meclis', 'miting', 'soylu', 'anayasa', 'mahkeme'
                       ,'meral_aksener', 'kilicdarogluk', 'vekilince', 'RTErdogan', 'MevlutCavusoglu', 'umitozdag','vedatbilgn','SavciSayan','fahrettinaltun','erdogan','devlet bahçeli','bahçeli','ülkü'
                        'tcbestepe','fuatoktay','suleymansoylu','ikalin1','ekrem_imamoglu','mansuryavas06','pkk','recep','tayyip','erdoğan','cb','#ak','cumhuriyet'
                        'hdp','terörist','chpkk','kadro','ek','tcbestepe','MevlutCavusoglu', 'ozdag', 'özdağ', 'TBMM','drfahrettinkoca', 'yenisafak', 'tayyip', 'cumhur', 'belediye', 'baskan', 'başkan', 'ulusal',
                        'odatv', 'suleyman', 'haskologlu', 'mansur', 'dbdevletbahceli', 'Ahmet_Davutoglu', 'babacan', 'gazetesozcu', 'imamoglu', 'imamoğlu', 'parlament', 'meclis', 
                        'savaş', 'eğitim', 'egitim', 'dolar', 'lira', 'enflasyon', 'euro', 'döviz', 'altın', 'benzin', 'atama', 'altılı masa', 'abd', 'avrupa', 'almanya', 'nato',
                        'sınır', 'göçmen', 'gocmen', 'sığınmacı', 'mülteci', 'mahkeme', 'kanun', 'ukrayna', 'rusya', 'komisyon', 'fetö', 'faiz', 'piyasa', 'banka', 'politik', 'toplantı', 'çiftçi',
                        'saray', 'demokrasi', 'faşist', 'kemal', 'rejim', 'özgürlük', 'koalisyon', 'egemen', 'kurultay', 'danıştay', 'davutoğlu', 'birleşmiş milletler',
                        'bahçeli', 'diplomasi', 'cem uzan', 'lgbt', 'seçim', '2023', 'ibb', 'cemaat', 'soylu', 'liberal', 'kapital', 'protesto', 'halk','seçim', 
                        'erken seçim','sandık','icraat','gençlik kolu','akp','chp','mhp', 'kayyum','anayasa','mahkeme','nebati','tcmb','merkez bankası','kılışdar','cumhurbaşkanı adayı','millet','zillet'
                        'rte','atama','EYT','atanamıyor','kanun','soruşturma','sorusturma','altılı masa','bakanım','af','genel af', 'skandal','yolsuzluk','ihale','zafer','genelaf','muhalefet','muhalif','bakan'
                       'meral_aksener', 'kilicdarogluk', 'vekilince', 'RTErdogan', 'MevlutCavusoglu', 'umitozdag','vedatbilgn','SavciSayan','fahrettinaltun',
                        'tcbestepe','fuatoktay','suleymansoylu','ikalin1','ekrem_imamoglu','mansuryavas06','murat_kurum',
                       'MevlutCavusoglu', 'drfahrettinkoca', 'NureddinNebati','akaraismailoglu','DIBAliErbas','ismailcatakli',
                    'dbdevletbahceli', 'Ahmet_Davutoglu','deryayanikashb','suleymansoylu','06melihgokcek','mustafasentop','VahitKirisci',
                     'fuatoktay','Akparti','emineerdoğan','iletisim','bybekirbozdag','omerrcelik','kasapoglu','AK','Parti','seçim','AKP','CHP','MHP','Zafer Partisi']
    entities_in_text = [ent for ent in list_of_entities if ent.lower() in text.lower()]
    number_entities = len(entities_in_text)

    return number_entities

In [23]:
def check_political_mention(ment):
    list_of_entities =['suleymansoylu', 'drfahrettinkoca','ekrem_imamoglu','RT_Erdogan','kilicdarogluk','meral_aksener',
                        'alibabacan','Ahmet_Davutoglu','umitozdag','mansuryavas06',
                         'RTErdogan','omerrcelik','suleymansoylu' ,'EmineErdogan' ,'deryayanikashb' ,'Akparti' ,'omerrcelik',
                       'bybekirbozdag','vedatbilgn','tcmeb','yilmaznazif','ayhan_ogan','ismailcatakli','hasandogan',
                        'osmannnurika','bbismailerdem','meral_aksener', 'kilicdarogluk', 'vekilince', 'RTErdogan', 'MevlutCavusoglu', 'umitozdag','vedatbilgn','SavciSayan','fahrettinaltun',
                        'tcbestepe','fuatoktay','suleymansoylu','ikalin1','ekrem_imamoglu','mansuryavas06','murat_kurum',
                       'MevlutCavusoglu', 'drfahrettinkoca', 'NureddinNebati','akaraismailoglu','DIBAliErbas','ismailcatakli',
                    'dbdevletbahceli', 'Ahmet_Davutoglu','deryayanikashb','suleymansoylu','06melihgokcek','mustafasentop','VahitKirisci',
                     'fuatoktay','Akparti','emineerdoğan','iletisim','bybekirbozdag','omerrcelik','kasapoglu']
    entities_in_text = [ent for ent in list_of_entities if ent in ment]
    return len(entities_in_text)


In [24]:
def check_political_reply(rep):
    list_of_entities = ['suleymansoylu', 'drfahrettinkoca','ekrem_imamoglu','RT_Erdogan','kilicdarogluk','meral_aksener',
                        'alibabacan','Ahmet_Davutoglu','umitozdag','mansuryavas06',
                         'RTErdogan','omerrcelik','suleymansoylu' ,'EmineErdogan' ,'deryayanikashb' ,'Akparti' ,'omerrcelik',
                       'bybekirbozdag','vedatbilgn','tcmeb','yilmaznazif','ayhan_ogan','ismailcatakli','hasandogan',
                        'osmannnurika','bbismailerdem','meral_aksener', 'kilicdarogluk', 'vekilince', 'RTErdogan', 'MevlutCavusoglu', 'umitozdag','vedatbilgn','SavciSayan','fahrettinaltun',
                        'tcbestepe','fuatoktay','suleymansoylu','ikalin1','ekrem_imamoglu','mansuryavas06','murat_kurum',
                       'MevlutCavusoglu', 'drfahrettinkoca', 'NureddinNebati','akaraismailoglu','DIBAliErbas','ismailcatakli',
                    'dbdevletbahceli', 'Ahmet_Davutoglu','deryayanikashb','suleymansoylu','06melihgokcek','mustafasentop','VahitKirisci',
                     'fuatoktay','Akparti','emineerdoğan','iletisim','bybekirbozdag','omerrcelik','kasapoglu']
    
    if(rep in list_of_entities):
        return 1
    else:
        return 0

In [25]:
def get_user_info_metadata_tweet(user_metadata_line):
    user_metadata_line = user_metadata_line['user']
    user_id = user_metadata_line['id_str']
    user_name = user_metadata_line['name']
    user_screen_name = user_metadata_line['screen_name'].lower()
    user_location = user_metadata_line['location']
    user_description = user_metadata_line['description']
    user_followers_count = user_metadata_line['followers_count']
    user_friends_count = user_metadata_line['friends_count']
    user_created_at = user_metadata_line['created_at']
    user_protected = user_metadata_line['protected']
    user_verified = user_metadata_line['verified']
    user_geo_enabled = user_metadata_line['geo_enabled']
    user_tweet_count = user_metadata_line['statuses_count']
    user_fav_count = user_metadata_line['favourites_count']
    
    
    dictionary = { 'user_followers_count':user_followers_count, 'user_friends_count':user_friends_count, 
                  'user_created_at':user_created_at, 'user_protected':user_protected,
                   'user_geo_enabled':user_geo_enabled, 
                  'user_tweet_count':user_tweet_count, 'user_fav_count':user_fav_count}

    return dictionary

#### 1.1.2.2. Number of total interactions

In [26]:
def total_interactions(retweet_count, favorite_count):
    total_num_interactions = retweet_count + favorite_count
    
    return total_num_interactions

#### 1.2.1.1. Get user info metadata

In [27]:
def get_user_info_metadata(user_metadata_line):
    
    user_id = user_metadata_line['id_str']
    user_name = user_metadata_line['name']
    user_screen_name = user_metadata_line['screen_name'].lower()
    user_location = user_metadata_line['location']
    user_description = user_metadata_line['description']
    user_followers_count = user_metadata_line['followers_count']
    user_friends_count = user_metadata_line['friends_count']
    user_created_at = user_metadata_line['created_at']
    user_protected = user_metadata_line['protected']
    user_verified = user_metadata_line['verified']
    user_geo_enabled = user_metadata_line['geo_enabled']
    user_tweet_count = user_metadata_line['statuses_count']
    user_fav_count = user_metadata_line['favourites_count']
    
    
    dictionary = {'user_id':user_id, 'user_name': user_name, 'user_screen_name':user_screen_name, 'user_location':user_location,
     'user_description':user_description, 'user_followers_count':user_followers_count, 'user_friends_count':user_friends_count, 
                  'user_created_at':user_created_at, 'user_protected':user_protected,
                  'user_verified': user_verified, 'user_geo_enabled':user_geo_enabled, 
                  'user_tweet_count':user_tweet_count, 'user_fav_count':user_fav_count}

    return dictionary

#### 1.2.1.2. Get followers/(followers+friends) ratio

In [28]:
def get_followers_all_ratio(user_followers_count, user_friends_count):
    
    if user_friends_count + user_followers_count == 0:
        followers_all_ratio = 0

    else:
        followers_all_ratio =  user_followers_count / (user_friends_count + user_followers_count)

    return followers_all_ratio

#### 1.2.1.3. Get description length

In [29]:
def get_desc_len(user_description):
    
    description_len = len(user_description)

    return description_len

#### 1.2.1.3. Get since

In [30]:
import time
from datetime import date
from datetime import datetime

In [31]:
def get_since(user_created_at):
    if user_created_at != None:
        creation_date = datetime.strptime(user_created_at ,'%a %b %d %H:%M:%S +0000 %Y')
        since = datetime.now() - creation_date
    else:
        since = 0
    return int(since.days)

#### 1.1.1.5. Get Number of Digits

In [32]:
def num_of_digits(username):
    count = 0

    for x in username:
        if x.isdigit():
            count+=1

    return count

### 1.1.2. Collect data using the functions above and transform into a Pandas DataFrame

In [33]:
dfPolitical = {'tweet_id':[],
              'is_retweet':[],
              'retweeted_username':[],
               'is_reply':[],
               'replied_username':[],
              'text':[],
              'num_mentions':[],
              'num_hashtags':[],
              'num_retweets':[],
              'num_favorites':[],
              'user_id':[],
              'user_screen_name':[],
              'user_description':[],
              'num_political_entities':[],
              'total_interactions':[],
              'punctuation': [],
              'tweet_length' : [],
              'is_verified': [],
                'is_political_mention': [],
                'description_entities':[],
               'description_length':[],
               'is_political_reply': [],
               'is_political_retweet':[],
               'avg_hash': [],
               
            'user_followers_count':[], 'user_friends_count':[], 
                  'user_created_at':[], 'user_protected':[],
                  'user_geo_enabled':[], 
                  'user_tweet_count':[], 'user_fav_count':[],'since':[]
               ,'average_tweet':[],'average_fav':[],'count_digits':[],'followers_to_all_ratio':[],
               'count_digits_tweets':[],'count_digits_desc':[],'punctuation_desc': [],'nonalpha': []
               
               
               
              }


with gzip.open(f"{PATH_TO_DOWNLOADED}tweet_metadata.jsons.gz", "rb") as f:
    for line in f:
        line = json.loads(line)
        
        
        dictionary = get_user_info_metadata_tweet(line)
        for k,v in dictionary.items():
            dfPolitical[k].append(v)
      
        
        # raw data:
        id_str = get_tweet_id(line)
        is_retweet, retweeted_username = check_if_retweet(line)
        is_reply, replied_username = check_if_reply(line)
        text = get_tweet_text(line)
        num_mentions, mention ,num_hashtags, avgh = get_number_mentions_hashtags(line)
        political_mention = check_political_mention(mention)
        political_reply = check_political_reply(replied_username)
        political_retweet = check_political_reply(retweeted_username)
        retweet_count, favorite_count = get_number_retweets_favorites(line)
        user_id_str, screen_name, user_description = get_user_info(line)
        punctuation = get_punc_num(text)
        is_verified = check_if_verified(line)
        tweet_length = len(text)
        description_length = len(user_description)
        # manually crafted data:
        num_political_entities = check_political_ent(text)
        total_num_interactions = total_interactions(retweet_count, favorite_count)
        dfPolitical['tweet_id'].append(id_str)
        dfPolitical['avg_hash'].append(avgh)
        dfPolitical['is_retweet'].append(is_retweet)
        dfPolitical['retweeted_username'].append(retweeted_username)
        dfPolitical['is_reply'].append(is_reply)
        dfPolitical['replied_username'].append(replied_username)
        dfPolitical['text'].append(text)
        dfPolitical['num_mentions'].append(num_mentions)
        dfPolitical['is_political_mention'].append(political_mention)
        dfPolitical['is_political_reply'].append(political_reply)
        dfPolitical['is_political_retweet'].append(political_retweet)
        dfPolitical['num_hashtags'].append(num_hashtags)
        dfPolitical['num_retweets'].append(retweet_count)
        dfPolitical['num_favorites'].append(favorite_count)
        dfPolitical['user_id'].append(user_id_str)
        dfPolitical['user_screen_name'].append(screen_name)
        dfPolitical['description_entities'].append(description_entities(user_description))
        dfPolitical['user_description'].append(user_description)
        dfPolitical['num_political_entities'].append(num_political_entities)
        dfPolitical['total_interactions'].append(total_num_interactions)
        dfPolitical['punctuation'].append(punctuation)
        dfPolitical['tweet_length'].append(tweet_length)
        dfPolitical['description_length'].append(tweet_length)
        dfPolitical['is_verified'].append(is_verified)
        dfPolitical['nonalpha'].append(nonalpha(text))
        dfPolitical['punctuation_desc'].append(get_punc_num(user_description))
        
          
        since = get_since(dictionary['user_created_at'])
        dfPolitical['since'].append(since)
        
        followers_all_ratio = get_followers_all_ratio(dictionary['user_followers_count'], 
                                                      dictionary['user_friends_count'])
        
        dfPolitical['average_tweet'].append(dictionary['user_tweet_count'] / since)
        dfPolitical['average_fav'].append(dictionary['user_fav_count'] / since)
        
        dfPolitical['count_digits'].append(num_of_digits(screen_name))
        dfPolitical['count_digits_tweets'].append(num_of_digits(text))
        dfPolitical['count_digits_desc'].append(num_of_digits(user_description))

        dfPolitical['followers_to_all_ratio'].append(followers_all_ratio)

In [34]:
dfPolitical = pd.DataFrame(dfPolitical)
dfPolitical

Unnamed: 0,tweet_id,is_retweet,retweeted_username,is_reply,replied_username,text,num_mentions,num_hashtags,num_retweets,num_favorites,user_id,user_screen_name,user_description,num_political_entities,total_interactions,punctuation,tweet_length,is_verified,is_political_mention,description_entities,description_length,is_political_reply,is_political_retweet,avg_hash,user_followers_count,user_friends_count,user_created_at,user_protected,user_geo_enabled,user_tweet_count,user_fav_count,since,average_tweet,average_fav,count_digits,followers_to_all_ratio,count_digits_tweets,count_digits_desc,punctuation_desc,nonalpha
0,1588568792984346624,0,,0,,"Sosyal Hizmetin temelini çocuk oluşturur,çocuğ...",0,0,49,98,920963718103650304,maviruh_,shu/\nburaya afilli bir söz yazdığımı varsayın,1,147,3,140,False,0,1,140,0,0,0.0,284,539,Thu Oct 19 10:43:55 +0000 2017,False,True,2638,35673,1923,1.371815,18.550702,0,0.345079,1,0,0,21
1,1588452263047069697,0,,1,mahirunal,"@mahirunal Gavur İzmir ya onlar, hani Cumhuriy...",1,0,0,0,595514060,mtfdan,,1,0,2,97,False,0,0,97,0,0,0.0,131,589,Thu May 31 14:08:26 +0000 2012,False,False,6647,601,3889,1.709180,0.154538,0,0.181944,0,0,0,15
2,1569589330544398336,0,,0,,#ŞehitAdayıUzmÇvşaKadro\nSiz İstesenizde Istem...,0,1,0,0,1356375754561490947,ahsucilginuzman,Vatan Sevdalisi,3,0,3,140,False,0,0,140,0,0,22.0,60,118,Mon Feb 01 22:56:06 +0000 2021,False,False,2924,1230,721,4.055479,1.705964,0,0.337079,2,0,0,26
3,1570428119609139201,0,,1,ajans_muhbir,@ajans_muhbir Siz kaypak olmayıp onay vermesey...,1,0,0,0,1478775431008595968,hamitelkelle,HighOne,1,0,2,140,False,0,0,140,0,0,0.0,2,69,Wed Jan 05 17:08:49 +0000 2022,False,False,1783,608,383,4.655352,1.587467,0,0.028169,1,0,0,26
4,1551163840368414722,0,,0,,Engelli öğretmenler olarak önümüzdeki engeller...,0,0,0,0,1511976696337113088,sed58417690,,1,0,3,140,False,0,0,140,0,0,0.0,119,166,Thu Apr 07 07:58:42 +0000 2022,False,False,7559,2427,292,25.886986,8.311644,8,0.417544,3,0,0,21
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33528,1568595408233832448,0,,0,,Gerçek kimlik taşımayan hesaplara cevap vermem...,0,0,9,81,576247173,ardanzenturk,RT ONAYLADIĞIM ANLAMINA GELMEZ\nArtık fikirler...,1,90,3,140,False,0,1,140,0,0,0.0,171399,4147,Thu May 10 11:47:23 +0000 2012,False,True,41342,4990,3911,10.570698,1.275889,0,0.976377,0,3,2,24
33529,1584027427696959488,0,,1,umitozdag,@umitozdag Neden Suriyelilerle ilgili bu kadar...,1,0,1,8,162308585,ozgul_61,Bridge design engineer Yaay hesabı : dilfiruz,3,9,2,140,False,2,0,140,1,0,0.0,3936,3227,Sat Jul 03 07:58:43 +0000 2010,False,True,52340,193138,4588,11.408021,42.096338,2,0.549490,4,0,1,29
33530,1585945783307730945,0,,1,celebimehmeta,@celebimehmeta Niye Türkiye yüzyılıda.Türkiye ...,1,0,0,1,415025519,ladrekova,,1,1,2,76,False,0,0,76,0,0,0.0,121,412,Thu Nov 17 20:15:55 +0000 2011,False,True,1225,1418,4085,0.299878,0.347124,0,0.227017,0,0,0,10
33531,1569748909521801221,1,muazzezeralp,0,,RT @muazzezeralp: @Doan58213655 @denizkonur @N...,7,1,6,0,1442125177727307781,yapikytgrivrlsn,,6,6,1,140,False,3,0,140,0,0,17.0,591,1003,Sun Sep 26 13:53:55 +0000 2021,False,False,153819,43990,484,317.807851,90.888430,0,0.370765,16,0,0,38


In [35]:
dfPolitical.describe()

Unnamed: 0,is_retweet,is_reply,num_mentions,num_hashtags,num_retweets,num_favorites,num_political_entities,total_interactions,punctuation,tweet_length,is_political_mention,description_entities,description_length,is_political_reply,is_political_retweet,avg_hash,user_followers_count,user_friends_count,user_tweet_count,user_fav_count,since,average_tweet,average_fav,count_digits,followers_to_all_ratio,count_digits_tweets,count_digits_desc,punctuation_desc,nonalpha
count,33533.0,33533.0,33533.0,33533.0,33533.0,33533.0,33533.0,33533.0,33533.0,33533.0,33533.0,33533.0,33533.0,33533.0,33533.0,33533.0,33533.0,33533.0,33533.0,33533.0,33533.0,33533.0,33533.0,33533.0,33533.0,33533.0,33533.0,33533.0,33533.0
mean,0.137536,0.655742,1.446187,0.167358,173.729162,7.060388,2.505144,180.789551,1.573256,102.203919,1.075239,0.700385,102.203919,0.210062,0.04002,2.150042,13432.83,1231.641905,31819.02,27537.88,1515.17195,34.372647,29.121734,1.960845,0.422808,2.261653,0.38571,0.435213,19.997823
std,0.344418,0.475133,1.567444,0.525753,1041.260855,108.140531,2.506287,1047.540331,1.065874,42.245978,1.556533,1.56608,42.245978,0.407358,0.19601,6.136218,178048.7,3523.541544,568478.1,64348.11,1425.757889,193.230222,59.728075,2.772765,0.224282,2.961814,1.633528,0.770801,9.737021
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,0.0,0.0,4.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,54.0,0.000253,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0,1.0,62.0,0.0,0.0,62.0,0.0,0.0,0.0,85.0,143.0,1990.0,1625.0,358.0,1.921723,1.886516,0.0,0.273381,0.0,0.0,0.0,11.0
50%,0.0,1.0,1.0,0.0,0.0,0.0,2.0,1.0,2.0,125.0,0.0,0.0,125.0,0.0,0.0,0.0,266.0,369.0,6938.0,7036.0,916.0,7.9747,8.554939,1.0,0.426174,1.0,0.0,0.0,22.0
75%,0.0,1.0,2.0,0.0,1.0,1.0,4.0,5.0,2.0,140.0,2.0,1.0,140.0,0.0,0.0,0.0,992.0,1041.0,22277.0,26230.0,2525.0,30.695871,29.869262,3.0,0.516544,3.0,0.0,1.0,27.0
max,1.0,1.0,12.0,12.0,26401.0,8469.0,20.0,26401.0,5.0,152.0,15.0,18.0,152.0,1.0,1.0,48.0,9213455.0,181198.0,51697350.0,3141282.0,5790.0,16638.992276,3185.884381,14.0,1.0,55.0,49.0,5.0,101.0


## 1.2. From Users

### 1.2.1. Get user metadata from user_profiles.jsons.gz

In [36]:
dfBot = {'user_id':[],
         'user_name':[],
         'user_screen_name':[],
         'user_location':[],
         'user_description':[],
         'user_followers_count':[],
         'user_friends_count':[],
         'description_len':[],
         'followers_to_all_ratio':[],
        'user_created_at':[],
        'user_protected':[],
        'user_verified':[],
         'user_geo_enabled':[],
         'since':[],
         'user_tweet_count':[],
         'user_fav_count':[],
        'average_tweet': [],
        'average_fav': [],
        'count_digits': []}


with gzip.open(f"{PATH_TO_DOWNLOADED}user_profiles.jsons.gz", "rb") as f:
    for line in f:
        line = json.loads(line)

        dictionary = get_user_info_metadata(line)
        for k,v in dictionary.items():
            dfBot[k].append(v)

        
        # manually crafted data:
        description_len = get_desc_len(dictionary['user_description'])
        dfBot['description_len'].append(description_len)
        
        since = get_since(dictionary['user_created_at'])
        dfBot['since'].append(since)
        
        followers_all_ratio = get_followers_all_ratio(dictionary['user_followers_count'], 
                                                      dictionary['user_friends_count'])
        
        dfBot['average_tweet'].append(dictionary['user_tweet_count'] / since)
        dfBot['average_fav'].append(dictionary['user_fav_count'] / since)
        
        dfBot['count_digits'].append(num_of_digits(dictionary['user_screen_name']))

        dfBot['followers_to_all_ratio'].append(followers_all_ratio)

In [37]:
dfBot = pd.DataFrame(dfBot)
dfBot

Unnamed: 0,user_id,user_name,user_screen_name,user_location,user_description,user_followers_count,user_friends_count,description_len,followers_to_all_ratio,user_created_at,user_protected,user_verified,user_geo_enabled,since,user_tweet_count,user_fav_count,average_tweet,average_fav,count_digits
0,1431241870848450577,Nasreena Khan Wazir,nasreenakhan006,"Islamabad, Pakistan",Student,65,185,7,0.260000,Fri Aug 27 13:07:30 +0000 2021,False,False,True,515,2551,17676,4.953398,34.322330,3
1,1304340303080386560,fania :((((,scorpiehoez,bogor,have a holly jolly🎄,8235,3011,19,0.732260,Fri Sep 11 08:45:44 +0000 2020,False,False,True,865,42771,15474,49.446243,17.889017,0
2,1116042038577958914,Yusuf Aksoy,yusufak63712920,,"Bir şeyden pişmanlık duymak istemiyorsan,her ş...",95,399,64,0.192308,Wed Apr 10 18:15:31 +0000 2019,False,False,False,1384,14300,18220,10.332370,13.164740,8
3,4859899931,Be (VIXX6) ama oppalarının düğününe gidemiyor,nedenburdaysam,Hufflepuff ortak salon,"SMStan\n/St☆rlight ///come on girls,this is ou...",40,83,65,0.325203,Fri Jan 29 11:01:25 +0000 2016,False,False,False,2552,21303,26999,8.347571,10.579545,0
4,2225373636,SLMDMR,biologselim,,BİYOLOG🔬🦠\nNanoteknoloji,100,98,23,0.505051,Sun Dec 01 18:16:41 +0000 2013,False,False,False,3340,1629,2179,0.487725,0.652395,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29665,1320834618220781569,islammm,islam_mrsj,,,229,217,0,0.513453,Mon Oct 26 21:08:22 +0000 2020,False,False,False,819,2396,10820,2.925519,13.211233,0
29666,111074128,Melda Onur,meldaonur,,"Şekersiz çay, etsiz sofra, SAVAŞSIZ dünya... 🐌...",212457,5428,100,0.975088,Wed Feb 03 18:39:01 +0000 2010,False,False,True,4737,75178,36671,15.870382,7.741398,0
29667,36946875,ali ydm,ali_ydm,"İstanbul, Türkiye",hayat oyunsa bende jeton çok,116,141,28,0.451362,Fri May 01 13:56:23 +0000 2009,False,False,False,5015,6482,7389,1.292522,1.473380,0
29668,2389587396,Türkan Usta,turkanusta,"Ankara, Türkiye",Ustaya sormuşlar; hayatta yaptığın en büyük is...,1669,3639,116,0.314431,Fri Mar 14 18:05:09 +0000 2014,False,False,True,3237,121113,140095,37.415199,43.279271,0


### 1.2.2. Get Tweet Info of Users in user_profiles.jsons.gz

#### 1.2.2.1. Check ratio of retweets to all tweets

In [38]:
def get_retweet_tweet_ratio(line):
    number_retweets = 0
    number_original_tweets = 0

    for tweet in line['tweets']:
        try:
            tweet['retweeted_status']
            number_retweets += 1
                
        except:
            number_original_tweets += 1
            
    total_tweets = number_retweets + number_original_tweets
    
    if total_tweets == 0:
        retweet_total_ratio = None
    else:
        retweet_total_ratio = number_retweets/(total_tweets)
    
    return retweet_total_ratio

#### 1.2.2.2. Check median number of favorites

In [39]:
def get_median_number_favorites(line):
    num_median_favorites = np.median([tweet['favorite_count'] for tweet in line['tweets']])

    return num_median_favorites

In [40]:
def get_avg_length(line):
    if len(line["tweets"]) == 0:
        return 0

    avg = np.mean([len(tweet['text']) for tweet in line['tweets']])

    return avg

#check max length of tweets

def get_max_length(line):

    if len(line["tweets"]) == 0:
        return 0

    m = np.max([len(tweet['text']) for tweet in line['tweets']])

    return m


def get_std_length(line):

    if len(line["tweets"]) == 0:
        return 0

    m = np.std([len(tweet['text']) for tweet in line['tweets']])

    return m

In [41]:
import re, math 
from collections import Counter 
WORD = re.compile(r'\w+') 
def cosine_sim(v1, v2): 
     intersection = set(v1.keys()) & set(v2.keys()) 
     nume = sum([v1[x] * v2[x] for x in intersection]) 
     test1 = sum([v1[x]**2 for x in v1.keys()]) 
     test2 = sum([v2[x]**2 for x in v2.keys()]) 
     den = math.sqrt(test1) * math.sqrt(test2) 
     if not den: 
        return 0.0 
     else: 
        return float(nume) / den 
def vectorize_text(text): 
     words = WORD.findall(text) 
     return Counter(words) 

def avg_cosine_sim(line):
    tweets = line['tweets']
    sims = []
    for x in range(len(tweets) - 3):
        tweet = tweets[x]['text']
        tweet2 = tweets[x + 1]['text']
        tweet3 = tweets[x + 2]['text']
        tweet4 = tweets[x + 3]['text']

        v1 = vectorize_text(tweet)
        v2 = vectorize_text(tweet2)
        v3 = vectorize_text(tweet3)
        v4 = vectorize_text(tweet4)

        cosim = cosine_sim(v1, v2)
        cosim2 = cosine_sim(v1, v3)
        cosim3 = cosine_sim(v1, v4)
        
        cosim = max([cosim, cosim2, cosim3])

        sims.append(cosim)

    avg = np.mean(sims)

    return avg


def countlinks(line):
    count = 0
    try:
        for x in line['tweets']:

            try:
                tweet = x['entities']['urls']
                count += len(tweet)
            except:
                continue
        
    except:
        return count

    return count




def tweetvelocity(line):

    if len(line['tweets']) == 0:
        return 1

    count = 0
    arr = []
    for x in range(len(line['tweets']) - 1):

        t = pd.to_datetime(line['tweets'][x]["created_at"])
        t2 = pd.to_datetime(line['tweets'][x+1]["created_at"])

        v = t - t2

        arr.append(v.days)

    return np.mean(arr)
        

def tweetvelocity_2(line):

    if len(line['tweets']) == 0:
        return 200

    t = pd.to_datetime(line['tweets'][0]["created_at"])
    t2 = pd.to_datetime(line['tweets'][-1]["created_at"])  

    return (t - t2).days


def media_count(line):
    count = 0
    for tweet in line["tweets"]:
        try: 
            y = len(tweet["entities"]["media"])
            count += 1
        except:
            continue

    return count

def count_quote(line):
    count = 0
    for tweet in line['tweets']:
        if tweet["is_quote_status"]:
            count += 1

    return count

### 1.2.3. Collect data using the functions above and transform into a Pandas DataFrame

In [42]:
dfBotTweets = {'user_id':[],
               'retweet_total_ratio':[],
               'num_median_favorites':[],
               'num_of_tweets':[],
               'avg_tweet_length':[],
               'max_tweet_length':[],
               'std_tweet_length':[],
               'avg_cosine_sim': [],
               'link_count': [],
               'media': [],
               'quote_tweets': []
              }

i = 0

with gzip.open(f"{PATH_TO_DOWNLOADED}user_tweets.jsons.gz", "rb") as f:
    for line in f:

        line = json.loads(line)

        user_id = line['user_id']
        dfBotTweets['user_id'].append(user_id)
        
        retweet_total_ratio = get_retweet_tweet_ratio(line)
        dfBotTweets['retweet_total_ratio'].append(retweet_total_ratio)
        
        num_median_favorites = get_median_number_favorites(line)
        dfBotTweets['num_median_favorites'].append(num_median_favorites)
        
        dfBotTweets['num_of_tweets'].append(len(line['tweets']))
        
        avg_tweet_length = get_avg_length(line)
        dfBotTweets['avg_tweet_length'].append(avg_tweet_length)

        max_tweet_length = get_max_length(line)
        dfBotTweets['max_tweet_length'].append(max_tweet_length)

        std_tweet_length = get_std_length(line)
        dfBotTweets['std_tweet_length'].append(std_tweet_length)
        
        avg_cosine = avg_cosine_sim(line)
        dfBotTweets['avg_cosine_sim'].append(avg_cosine)

        link_count = countlinks(line)
        dfBotTweets['link_count'].append(link_count)

        media = media_count(line)
        dfBotTweets['media'].append(media)

        quote_tweets = count_quote(line)
        dfBotTweets['quote_tweets'].append(quote_tweets)

        i += 1
        if i % 1000 == 0:
            print(i)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000


In [43]:
dfBotTweets = pd.DataFrame(dfBotTweets)
dfBotTweets

Unnamed: 0,user_id,retweet_total_ratio,num_median_favorites,num_of_tweets,avg_tweet_length,max_tweet_length,std_tweet_length,avg_cosine_sim,link_count,media,quote_tweets
0,594642154,0.115000,2.0,200,87.340000,140,46.855249,0.121920,70,5,36
1,525600289,0.005025,1.0,199,49.351759,140,38.673961,0.140806,21,10,0
2,931895965501534209,0.900000,0.0,200,132.025000,140,22.288885,0.296732,10,9,3
3,1591543462746329088,0.185000,0.0,200,119.200000,140,30.044301,0.197845,84,15,12
4,734801354749796352,1.000000,0.0,200,122.155000,140,28.996223,0.141741,8,52,14
...,...,...,...,...,...,...,...,...,...,...,...
28310,1591370361488252928,0.800000,0.0,200,128.135000,140,18.832599,0.287320,24,0,2
28311,1475272459616235525,0.825000,0.0,200,119.580000,144,34.269864,0.179948,10,14,10
28312,1096753792731750401,0.051020,1.0,196,69.673469,140,39.701636,0.124059,42,10,30
28313,1269527617687953409,0.095000,2.0,200,51.130000,140,31.249370,0.051498,7,15,6


In [44]:
dfBotTweets.isna().sum()

user_id                   0
retweet_total_ratio      83
num_median_favorites     83
num_of_tweets             0
avg_tweet_length          0
max_tweet_length          0
std_tweet_length          0
avg_cosine_sim          190
link_count                0
media                     0
quote_tweets              0
dtype: int64

### 1.2.3. Merge dfBot and dfBotTweets

In [45]:
dfBotAll = dfBot.merge(dfBotTweets,
                       how='left')

#dfBotAll[['retweet_total_ratio', 'num_median_favorites']] = dfBotAll[['retweet_total_ratio', 'num_median_favorites']].fillna(0)
dfBotAll['num_median_favorites'] = dfBotAll['num_median_favorites'].fillna((dfBotAll['num_median_favorites'].mean()))
dfBotAll['retweet_total_ratio'] = dfBotAll['retweet_total_ratio'].fillna((dfBotAll['retweet_total_ratio'].mean()))
dfBotAll[['user_protected', 'user_verified','user_geo_enabled']] = dfBotAll[['user_protected', 'user_verified','user_geo_enabled']].astype(int)
dfBotAll

Unnamed: 0,user_id,user_name,user_screen_name,user_location,user_description,user_followers_count,user_friends_count,description_len,followers_to_all_ratio,user_created_at,user_protected,user_verified,user_geo_enabled,since,user_tweet_count,user_fav_count,average_tweet,average_fav,count_digits,retweet_total_ratio,num_median_favorites,num_of_tweets,avg_tweet_length,max_tweet_length,std_tweet_length,avg_cosine_sim,link_count,media,quote_tweets
0,1431241870848450577,Nasreena Khan Wazir,nasreenakhan006,"Islamabad, Pakistan",Student,65,185,7,0.260000,Fri Aug 27 13:07:30 +0000 2021,0,0,1,515,2551,17676,4.953398,34.322330,3,0.395939,0.0,197.0,72.126904,140.0,38.729298,0.120227,1.0,47.0,2.0
1,1304340303080386560,fania :((((,scorpiehoez,bogor,have a holly jolly🎄,8235,3011,19,0.732260,Fri Sep 11 08:45:44 +0000 2020,0,0,1,865,42771,15474,49.446243,17.889017,0,0.125000,0.0,200.0,39.605000,140.0,33.943025,0.089971,20.0,20.0,11.0
2,1116042038577958914,Yusuf Aksoy,yusufak63712920,,"Bir şeyden pişmanlık duymak istemiyorsan,her ş...",95,399,64,0.192308,Wed Apr 10 18:15:31 +0000 2019,0,0,0,1384,14300,18220,10.332370,13.164740,8,0.910000,0.0,200.0,121.570000,140.0,32.988712,0.251797,29.0,7.0,54.0
3,4859899931,Be (VIXX6) ama oppalarının düğününe gidemiyor,nedenburdaysam,Hufflepuff ortak salon,"SMStan\n/St☆rlight ///come on girls,this is ou...",40,83,65,0.325203,Fri Jan 29 11:01:25 +0000 2016,0,0,0,2552,21303,26999,8.347571,10.579545,0,0.015306,1.0,196.0,83.020408,140.0,39.681379,0.191220,114.0,18.0,89.0
4,2225373636,SLMDMR,biologselim,,BİYOLOG🔬🦠\nNanoteknoloji,100,98,23,0.505051,Sun Dec 01 18:16:41 +0000 2013,0,0,0,3340,1629,2179,0.487725,0.652395,0,0.659898,0.0,197.0,119.421320,140.0,33.019455,0.161071,45.0,14.0,1.0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29665,1320834618220781569,islammm,islam_mrsj,,,229,217,0,0.513453,Mon Oct 26 21:08:22 +0000 2020,0,0,0,819,2396,10820,2.925519,13.211233,0,0.015000,1.0,200.0,70.435000,140.0,38.439378,0.159389,30.0,13.0,3.0
29666,111074128,Melda Onur,meldaonur,,"Şekersiz çay, etsiz sofra, SAVAŞSIZ dünya... 🐌...",212457,5428,100,0.975088,Wed Feb 03 18:39:01 +0000 2010,0,0,1,4737,75178,36671,15.870382,7.741398,0,0.291457,2.0,199.0,88.246231,140.0,44.701856,0.187754,79.0,13.0,64.0
29667,36946875,ali ydm,ali_ydm,"İstanbul, Türkiye",hayat oyunsa bende jeton çok,116,141,28,0.451362,Fri May 01 13:56:23 +0000 2009,0,0,0,5015,6482,7389,1.292522,1.473380,0,0.061538,0.0,195.0,42.805128,140.0,26.339265,0.174823,3.0,6.0,0.0
29668,2389587396,Türkan Usta,turkanusta,"Ankara, Türkiye",Ustaya sormuşlar; hayatta yaptığın en büyük is...,1669,3639,116,0.314431,Fri Mar 14 18:05:09 +0000 2014,0,0,1,3237,121113,140095,37.415199,43.279271,0,0.995000,0.0,200.0,129.175000,140.0,23.942731,0.185794,7.0,24.0,7.0


#### DF Preprocessings

In [46]:
dfPolitical[['user_protected', 'is_verified','user_geo_enabled']] = dfPolitical[['user_protected', 'is_verified','user_geo_enabled']].astype(int)

In [47]:
dfPolitical['is_verified'] = dfPolitical['is_verified'].astype(int)
dfPolitical['is_political_mention'] = dfPolitical['is_political_mention'].astype(int)
dfPolitical['is_political_reply'] = dfPolitical['is_political_reply'].astype(int)

In [48]:
dfBotAll['num_of_tweets'] = dfBotAll['num_of_tweets'].fillna((dfBotAll['num_of_tweets'].mean()))
dfBotAll['avg_tweet_length'] = dfBotAll['avg_tweet_length'].fillna((dfBotAll['avg_tweet_length'].mean()))
dfBotAll['max_tweet_length'] = dfBotAll['max_tweet_length'].fillna((dfBotAll['max_tweet_length'].mean()))
dfBotAll['std_tweet_length'] = dfBotAll['std_tweet_length'].fillna((dfBotAll['std_tweet_length'].mean()))
dfBotAll['avg_cosine_sim'] = dfBotAll['avg_cosine_sim'].fillna((dfBotAll['avg_cosine_sim'].mean()))
dfBotAll['link_count'] = dfBotAll['link_count'].fillna((dfBotAll['link_count'].mean()))
dfBotAll['media'] = dfBotAll['media'].fillna((dfBotAll['media'].mean()))
dfBotAll['quote_tweets'] = dfBotAll['quote_tweets'].fillna((dfBotAll['quote_tweets'].mean()))

# 2. TRAIN MODEL

## 2.1. Political Tweet Prediction

### 2.1.1. Merge dfPolitical data with labels

In [49]:
#dfPoliticalAll_train = dfPolitical.merge(dfBotTweets,on='user_id')

In [50]:
dfPoliticalAll_train = dfPolitical.merge(trainingTweetDf,
                                         on='tweet_id')

dfPoliticalAll_train.head()

Unnamed: 0,tweet_id,is_retweet,retweeted_username,is_reply,replied_username,text,num_mentions,num_hashtags,num_retweets,num_favorites,user_id,user_screen_name,user_description,num_political_entities,total_interactions,punctuation,tweet_length,is_verified,is_political_mention,description_entities,description_length,is_political_reply,is_political_retweet,avg_hash,user_followers_count,user_friends_count,user_created_at,user_protected,user_geo_enabled,user_tweet_count,user_fav_count,since,average_tweet,average_fav,count_digits,followers_to_all_ratio,count_digits_tweets,count_digits_desc,punctuation_desc,nonalpha,isPolitical
0,1588568792984346624,0,,0,,"Sosyal Hizmetin temelini çocuk oluşturur,çocuğ...",0,0,49,98,920963718103650304,maviruh_,shu/\nburaya afilli bir söz yazdığımı varsayın,1,147,3,140,0,0,1,140,0,0,0.0,284,539,Thu Oct 19 10:43:55 +0000 2017,0,1,2638,35673,1923,1.371815,18.550702,0,0.345079,1,0,0,21,Yes
1,1588568792984346624,0,,0,,"Sosyal Hizmetin temelini çocuk oluşturur,çocuğ...",0,0,49,98,920963718103650304,maviruh_,shu/\nburaya afilli bir söz yazdığımı varsayın,1,147,3,140,0,0,1,140,0,0,0.0,284,539,Thu Oct 19 10:43:55 +0000 2017,0,1,2638,35673,1923,1.371815,18.550702,0,0.345079,1,0,0,21,No
2,1588568792984346624,0,,0,,"Sosyal Hizmetin temelini çocuk oluşturur,çocuğ...",0,0,49,98,920963718103650304,maviruh_,shu/\nburaya afilli bir söz yazdığımı varsayın,1,147,3,140,0,0,1,140,0,0,0.0,284,539,Thu Oct 19 10:43:55 +0000 2017,0,1,2638,35673,1923,1.371815,18.550702,0,0.345079,1,0,0,21,Yes
3,1588568792984346624,0,,0,,"Sosyal Hizmetin temelini çocuk oluşturur,çocuğ...",0,0,49,98,920963718103650304,maviruh_,shu/\nburaya afilli bir söz yazdığımı varsayın,1,147,3,140,0,0,1,140,0,0,0.0,284,539,Thu Oct 19 10:43:55 +0000 2017,0,1,2638,35673,1923,1.371815,18.550702,0,0.345079,1,0,0,21,Yes
4,1588568792984346624,0,,0,,"Sosyal Hizmetin temelini çocuk oluşturur,çocuğ...",0,0,49,98,920963718103650304,maviruh_,shu/\nburaya afilli bir söz yazdığımı varsayın,1,147,3,140,0,0,1,140,0,0,0.0,284,539,Thu Oct 19 10:43:55 +0000 2017,0,1,2638,35673,1923,1.371815,18.550702,0,0.345079,1,0,0,21,No


In [51]:
dfPoliticalAll_train.describe()

Unnamed: 0,is_retweet,is_reply,num_mentions,num_hashtags,num_retweets,num_favorites,num_political_entities,total_interactions,punctuation,tweet_length,is_verified,is_political_mention,description_entities,description_length,is_political_reply,is_political_retweet,avg_hash,user_followers_count,user_friends_count,user_protected,user_geo_enabled,user_tweet_count,user_fav_count,since,average_tweet,average_fav,count_digits,followers_to_all_ratio,count_digits_tweets,count_digits_desc,punctuation_desc,nonalpha
count,3994.0,3994.0,3994.0,3994.0,3994.0,3994.0,3994.0,3994.0,3994.0,3994.0,3994.0,3994.0,3994.0,3994.0,3994.0,3994.0,3994.0,3994.0,3994.0,3994.0,3994.0,3994.0,3994.0,3994.0,3994.0,3994.0,3994.0,3994.0,3994.0,3994.0,3994.0,3994.0
mean,0.0,0.754382,1.296945,0.145719,1.449675,7.400351,2.212318,8.850025,1.51302,98.052328,0.018027,0.928893,0.690536,98.052328,0.248373,0.0,1.871514,13067.53,1088.970205,0.0,0.291187,23361.16,24542.250376,1519.605658,29.313688,26.254544,1.90686,0.416185,2.170255,0.392839,0.441662,19.150225
std,0.0,0.430507,1.427067,0.495204,12.823391,101.120743,2.350459,112.649101,1.083286,43.090991,0.133066,1.447798,1.621875,43.090991,0.432123,0.0,5.862578,148648.7,3051.679962,0.0,0.454366,64130.77,56144.150572,1434.882965,62.65599,56.156804,2.759014,0.22716,2.719651,1.621654,0.778323,9.714841
min,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,5.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,0.0,55.0,0.001313,0.0,0.0,0.0,0.0,0.0,0.0,0.0
25%,0.0,1.0,1.0,0.0,0.0,0.0,0.0,0.0,0.0,56.0,0.0,0.0,0.0,56.0,0.0,0.0,0.0,81.0,135.0,0.0,0.0,1934.75,1316.5,355.0,1.796231,1.616834,0.0,0.259386,0.0,0.0,0.0,11.0
50%,0.0,1.0,1.0,0.0,0.0,0.0,1.0,0.0,2.0,113.0,0.0,0.0,0.0,113.0,0.0,0.0,0.0,241.0,348.0,0.0,0.0,6243.0,6143.5,904.5,6.993229,7.268056,0.0,0.417224,1.0,0.0,0.0,20.0
75%,0.0,1.0,1.0,0.0,0.0,1.0,3.0,2.0,2.0,140.0,0.0,2.0,1.0,140.0,0.0,0.0,0.0,869.0,942.75,0.0,1.0,18720.5,21321.5,2508.75,27.457831,24.579415,2.0,0.516775,3.0,0.0,1.0,27.0
max,0.0,1.0,10.0,7.0,600.0,5722.0,14.0,6322.0,5.0,152.0,1.0,11.0,14.0,152.0,1.0,0.0,44.0,3509923.0,84879.0,0.0,1.0,1090423.0,789928.0,5790.0,1214.347692,899.307692,13.0,1.0,28.0,35.0,4.0,71.0


In [52]:
dfPoliticalAll_train.shape

(3994, 41)

### 2.1.2. Separate X and y values
We only use 3 features here to create a baseline model. However, it is not enough to get good results.

In [53]:
#X = dfPoliticalAll_train[['num_political_entities','total_interactions','num_hashtags']]
X = dfPoliticalAll_train.select_dtypes(exclude=['object'])
#X = dfPoliticalAll_train[['is_retweet','is_reply','num_mentions','num_hashtags','num_retweets','num_favorites','num_political_entities','total_interactions','punctuation','tweet_length','is_political_mention','is_political_reply','user_followers_count','user_friends_count','user_tweet_count','user_fav_count','since','average_tweet','average_fav','count_digits_tweets',  ]]
y = dfPoliticalAll_train['isPolitical'].apply(lambda x: 1 if x=='Yes' else 0)

In [54]:
X_binary = X[['is_verified','is_political_reply','is_political_retweet','user_protected','user_geo_enabled']]
X_numerical = X.drop(['is_verified','is_political_reply','is_political_retweet','user_protected','user_geo_enabled'], axis=1)

In [55]:
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()
X_numerical = ss.fit_transform(X_numerical)
X = np.concatenate((X_binary, X_numerical), axis=1)

In [56]:
X.shape

(3994, 32)

#### CFS

In [57]:
from sklearn.feature_selection import SelectKBest, mutual_info_classif

# create the feature selector
selector = SelectKBest(score_func=mutual_info_classif, k=10)

# fit the feature selector on the data
selector.fit(X, y)

# get the selected features
X = selector.transform(X)

### 2.1.3. Train - validation split

In [58]:
from sklearn.model_selection import train_test_split

X_train, X_valid2, y_train, y_valid2 = train_test_split(X, y, test_size=0.20, random_state=42)

In [59]:
X_train.shape

(3195, 10)

### 2.1.4. Train the model

Here, you may use different models such as neural networks, XGBoost, AdaBoost, RandomForest, Linear Regression, Logistic Regression etc. to see which model does the best. Also, you can use grid_search_cv() or a basic for loop to optimize the hyperparameters of your model.

In [60]:
from keras.models import Sequential
from keras.layers import Dense, Dropout

# create the model
model1 = Sequential()
model1.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
model1.add(Dropout(0.2))
model1.add(Dense(32, activation='relu'))
model1.add(Dropout(0.2))
model1.add(Dense(16, activation='relu'))
model1.add(Dropout(0.2))
model1.add(Dense(1, activation='sigmoid'))

# compile the model
model1.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# train the model
model1.fit(X_train, y_train, epochs=69, batch_size=64)
preds = model1.predict(X_valid2)



Epoch 1/69
Epoch 2/69
Epoch 3/69
Epoch 4/69
Epoch 5/69
Epoch 6/69
Epoch 7/69
Epoch 8/69
Epoch 9/69
Epoch 10/69
Epoch 11/69
Epoch 12/69
Epoch 13/69
Epoch 14/69
Epoch 15/69
Epoch 16/69
Epoch 17/69
Epoch 18/69
Epoch 19/69
Epoch 20/69
Epoch 21/69
Epoch 22/69
Epoch 23/69
Epoch 24/69
Epoch 25/69
Epoch 26/69
Epoch 27/69
Epoch 28/69
Epoch 29/69
Epoch 30/69
Epoch 31/69
Epoch 32/69
Epoch 33/69
Epoch 34/69
Epoch 35/69
Epoch 36/69
Epoch 37/69
Epoch 38/69
Epoch 39/69
Epoch 40/69
Epoch 41/69
Epoch 42/69
Epoch 43/69
Epoch 44/69
Epoch 45/69
Epoch 46/69
Epoch 47/69
Epoch 48/69
Epoch 49/69
Epoch 50/69
Epoch 51/69
Epoch 52/69
Epoch 53/69
Epoch 54/69
Epoch 55/69
Epoch 56/69
Epoch 57/69
Epoch 58/69
Epoch 59/69
Epoch 60/69
Epoch 61/69
Epoch 62/69
Epoch 63/69
Epoch 64/69
Epoch 65/69
Epoch 66/69
Epoch 67/69
Epoch 68/69
Epoch 69/69


In [61]:
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error

In [62]:
# from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error
mse = mean_squared_error(y_valid2, preds)
mse

0.16188227106665506

In [63]:
max(preds)

array([0.99671113], dtype=float32)

In [64]:
min(preds)

array([0.01654552], dtype=float32)

In [66]:
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import VotingClassifier

# Initialize the models
ada = AdaBoostClassifier()
xgb = XGBClassifier()
lgb = LGBMClassifier()
rf = RandomForestClassifier()

# Create the ensemble model
ensemble = VotingClassifier(estimators=[('ada', ada), ('xgb', xgb), ('lgb', lgb), ('rf', rf)], voting='soft')

# Fit the ensemble model on the training data
ensemble.fit(X_train, y_train)

# Make predictions on the test data
y_pred = ensemble.predict(X_valid2)

# Print the accuracy
print("Accuracy:", ensemble.score(X_valid2, y_valid2))

Accuracy: 0.7647058823529411


In [67]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import VotingClassifier

# Initialize the models
ada = AdaBoostClassifier()
xgb = XGBClassifier()
lgb = LGBMClassifier()
rf = RandomForestClassifier()

# Create the ensemble model
ensemble = VotingClassifier(estimators=[('ada', ada), ('xgb', xgb), ('lgb', lgb), ('rf', rf)], voting='soft')

# Define the base estimator (e.g. decision tree)
base_estimator = ensemble

# Define the bagging classifier
bagging_clf = BaggingClassifier(base_estimator=base_estimator, n_estimators=50, max_samples=0.8, max_features=0.8)

# Fit the bagging classifier to the training data
bagging_clf.fit(X_train, y_train)

# Predict on the test data
y_pred = bagging_clf.predict(X_valid2)
print("Accuracy:", bagging_clf.score(X_valid2, y_valid2))

Accuracy: 0.7847309136420526


## 2.2. Bot Detection

### 2.2.1. Merge dfBotAll data with labels

In [68]:
dfBotAll.user_screen_name = dfBotAll.user_screen_name.str.lower()

In [69]:
dfBotAll_train = dfBotAll.merge(trainingUserDf,
                               left_on='user_screen_name',
                               right_on='screen_name')

dfBotAll_train

Unnamed: 0,user_id,user_name,user_screen_name,user_location,user_description,user_followers_count,user_friends_count,description_len,followers_to_all_ratio,user_created_at,user_protected,user_verified,user_geo_enabled,since,user_tweet_count,user_fav_count,average_tweet,average_fav,count_digits,retweet_total_ratio,num_median_favorites,num_of_tweets,avg_tweet_length,max_tweet_length,std_tweet_length,avg_cosine_sim,link_count,media,quote_tweets,screen_name,isBot
0,1431241870848450577,Nasreena Khan Wazir,nasreenakhan006,"Islamabad, Pakistan",Student,65,185,7,0.260000,Fri Aug 27 13:07:30 +0000 2021,0,0,1,515,2551,17676,4.953398,34.322330,3,0.395939,0.0,197.0,72.126904,140.0,38.729298,0.120227,1.0,47.0,2.0,nasreenakhan006,No
1,1431241870848450577,Nasreena Khan Wazir,nasreenakhan006,"Islamabad, Pakistan",Student,65,185,7,0.260000,Fri Aug 27 13:07:30 +0000 2021,0,0,1,515,2551,17676,4.953398,34.322330,3,0.395939,0.0,197.0,72.126904,140.0,38.729298,0.120227,1.0,47.0,2.0,nasreenakhan006,No
2,1304340303080386560,fania :((((,scorpiehoez,bogor,have a holly jolly🎄,8235,3011,19,0.732260,Fri Sep 11 08:45:44 +0000 2020,0,0,1,865,42771,15474,49.446243,17.889017,0,0.125000,0.0,200.0,39.605000,140.0,33.943025,0.089971,20.0,20.0,11.0,scorpiehoez,No
3,1304340303080386560,fania :((((,scorpiehoez,bogor,have a holly jolly🎄,8235,3011,19,0.732260,Fri Sep 11 08:45:44 +0000 2020,0,0,1,865,42771,15474,49.446243,17.889017,0,0.125000,0.0,200.0,39.605000,140.0,33.943025,0.089971,20.0,20.0,11.0,scorpiehoez,No
4,1304340303080386560,fania :((((,scorpiehoez,bogor,have a holly jolly🎄,8235,3011,19,0.732260,Fri Sep 11 08:45:44 +0000 2020,0,0,1,865,42771,15474,49.446243,17.889017,0,0.125000,0.0,200.0,39.605000,140.0,33.943025,0.089971,20.0,20.0,11.0,scorpiehoez,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3904,1286770207134973954,Hamide Arabacı,anka6054,,,151,61,0,0.712264,Fri Jul 24 21:08:34 +0000 2020,0,0,0,913,5288,5347,5.791895,5.856517,4,0.000000,1.0,200.0,120.190000,140.0,23.062825,0.878755,124.0,0.0,0.0,anka6054,No
3905,1598032338323214338,atamabekleyenbahceci,atamabekleyenzz,,,173,367,0,0.320370,Wed Nov 30 19:13:03 +0000 2022,0,0,0,54,311,196,5.759259,3.629630,0,0.580000,0.0,200.0,131.670000,140.0,18.445897,0.635663,84.0,0.0,4.0,atamabekleyenzz,No
3906,760235343966863360,Emrah İNCİ,memrahinci,Istanbul - Bayburt,Researcher | Middle East | Political Science |...,5863,5905,71,0.498215,Mon Aug 01 22:06:45 +0000 2016,0,0,1,2366,1029,725,0.434911,0.306424,0,0.040000,36.0,200.0,127.080000,144.0,30.592378,0.242883,161.0,30.0,3.0,memrahinci,No
3907,1553973684100124672,Murat Kkk,muratkkk18,,Normal sıradan bir insanım,1,10,26,0.090909,Mon Aug 01 05:19:56 +0000 2022,0,0,0,176,18,38,0.102273,0.215909,2,0.769231,0.0,13.0,115.307692,140.0,31.972066,0.171521,2.0,2.0,0.0,muratkkk18,No


In [70]:
trainingUserDf.isBot.value_counts()

No     3110
Yes     799
Name: isBot, dtype: int64

### 2.2.2. Separate X and y values
We use only 4 features here to create a baseline model. However, it is not enough to get good results.

In [71]:
#X = dfBotAll_train[['description_len', 'followers_to_all_ratio', 'retweet_total_ratio', 'num_median_favorites']]
X = dfBotAll_train.select_dtypes(exclude=['object'])
y = dfBotAll_train.isBot.apply(lambda x: 1 if x=='Yes' else 0)

In [72]:
X_binary = X[['user_verified','user_protected','user_geo_enabled']]
X_numerical = X.drop(['user_verified','user_protected','user_geo_enabled'], axis=1)

In [73]:
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()
X_numerical = ss.fit_transform(X_numerical)
X = np.concatenate((X_binary, X_numerical), axis=1)

In [74]:
X.shape

(3909, 23)

#### CFS

In [75]:
from sklearn.feature_selection import SelectKBest, mutual_info_classif

# create the feature selector
selector = SelectKBest(score_func=mutual_info_classif, k=18)
# fit the feature selector on the data
selector.fit(X, y)

# get the selected features
X = selector.transform(X)

### 2.2.3. Train-test split

In [76]:
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.20, random_state=42)

In [77]:
X_train.shape

(3127, 18)

### 2.2.4. Train the model

In [78]:
from keras.models import Sequential
from keras.layers import Dense, Dropout

# create the model
model2 = Sequential()
model2.add(Dense(64, input_dim=X_train.shape[1], activation='relu'))
model2.add(Dropout(0.2))
model2.add(Dense(32, activation='relu'))
model2.add(Dropout(0.2))
model2.add(Dense(16, activation='relu'))
model2.add(Dropout(0.2))
model2.add(Dense(1, activation='sigmoid'))

# compile the model
model2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# train the model
model2.fit(X_train, y_train, epochs=69, batch_size=64)
preds = model2.predict(X_valid)

Epoch 1/69
Epoch 2/69
Epoch 3/69
Epoch 4/69
Epoch 5/69
Epoch 6/69
Epoch 7/69
Epoch 8/69
Epoch 9/69
Epoch 10/69
Epoch 11/69
Epoch 12/69
Epoch 13/69
Epoch 14/69
Epoch 15/69
Epoch 16/69
Epoch 17/69
Epoch 18/69
Epoch 19/69
Epoch 20/69
Epoch 21/69
Epoch 22/69
Epoch 23/69
Epoch 24/69
Epoch 25/69
Epoch 26/69
Epoch 27/69
Epoch 28/69
Epoch 29/69
Epoch 30/69
Epoch 31/69
Epoch 32/69
Epoch 33/69
Epoch 34/69
Epoch 35/69
Epoch 36/69
Epoch 37/69
Epoch 38/69
Epoch 39/69
Epoch 40/69
Epoch 41/69
Epoch 42/69
Epoch 43/69
Epoch 44/69
Epoch 45/69
Epoch 46/69
Epoch 47/69
Epoch 48/69
Epoch 49/69
Epoch 50/69
Epoch 51/69
Epoch 52/69
Epoch 53/69
Epoch 54/69
Epoch 55/69
Epoch 56/69
Epoch 57/69
Epoch 58/69
Epoch 59/69
Epoch 60/69
Epoch 61/69
Epoch 62/69
Epoch 63/69
Epoch 64/69
Epoch 65/69
Epoch 66/69
Epoch 67/69
Epoch 68/69
Epoch 69/69


In [79]:
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error

In [80]:
# from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error
mse = mean_squared_error(y_valid, preds)
mse

0.11315262060619571

In [81]:
max(preds)

array([0.9319312], dtype=float32)

In [82]:
min(preds)

array([2.541892e-12], dtype=float32)

In [83]:
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import VotingClassifier

# Initialize the models
ada = AdaBoostClassifier()
xgb = XGBClassifier()
lgb = LGBMClassifier()
rf = RandomForestClassifier()

# Create the ensemble model
ensemble = VotingClassifier(estimators=[('ada', ada), ('xgb', xgb), ('lgb', lgb), ('rf', rf)], voting='soft')

# Fit the ensemble model on the training data
ensemble.fit(X_train, y_train)

# Make predictions on the test data
y_pred = ensemble.predict(X_valid)

# Print the accuracy
print("Accuracy:", ensemble.score(X_valid, y_valid))

Accuracy: 0.8542199488491049


In [86]:
from keras.models import Sequential
from keras.layers import Dense, Dropout, Conv1D, MaxPooling1D, GlobalMaxPooling1D, BatchNormalization
from keras.callbacks import EarlyStopping
# Define the model architecture
model = Sequential()

# Add convolutional layers
model.add(Conv1D(filters=64, kernel_size=2, activation='relu', input_shape=(18, 1)))
model.add(BatchNormalization())
model.add(Conv1D(filters=64, kernel_size=2, activation='relu'))
model.add(BatchNormalization())
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.2))

model.add(Conv1D(filters=128, kernel_size=2, activation='relu'))
model.add(BatchNormalization())
model.add(Conv1D(filters=128, kernel_size=2, activation='relu'))
model.add(BatchNormalization())
model.add(MaxPooling1D(pool_size=2))
model.add(Dropout(0.2))

model.add(Conv1D(filters=256, kernel_size=2, activation='relu'))
model.add(BatchNormalization())
model.add(Conv1D(filters=256, kernel_size=2, activation='relu'))
model.add(BatchNormalization())
model.add(GlobalMaxPooling1D())
model.add(Dropout(0.2))

# Add dense layers
model.add(Dense(512, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.2))
model.add(Dense(256, activation='relu'))
model.add(BatchNormalization())
model.add(Dropout(0.2))

# Add output layer
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(optimizer='adam', loss='binary_crossentropy', metrics=['accuracy'])
early_stopping = EarlyStopping(monitor='val_loss', patience=10)

history = model.fit(X_train, y_train, batch_size=32, epochs=50, validation_split=0.2, callbacks=[early_stopping])

# Make predictions on the test set
y_pred = model.predict(X_valid)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50


In [87]:
from sklearn.ensemble import BaggingClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import VotingClassifier

# Initialize the models
ada = AdaBoostClassifier()
xgb = XGBClassifier()
lgb = LGBMClassifier()
rf = RandomForestClassifier()

# Create the ensemble model
ensemble = VotingClassifier(estimators=[('ada', ada), ('xgb', xgb), ('lgb', lgb), ('rf', rf)], voting='soft')

# Define the base estimator (e.g. decision tree)
base_estimator = ensemble

# Define the bagging classifier
bagging_clf = BaggingClassifier(base_estimator=base_estimator, n_estimators=100, max_samples=0.8, max_features=0.8)

# Fit the bagging classifier to the training data
bagging_clf.fit(X_train, y_train)

# Predict on the test data
y_pred = bagging_clf.predict(X_valid)
print("Accuracy:", bagging_clf.score(X_valid, y_valid))

Accuracy: 0.8529411764705882


# 3. MAKE PREDICTIONS

Here, you will make predictions with the models that you have trained above.

## 3.1. Predictions for Tweets (Political or Not)

In [88]:
# read the evaluation file as follows
evaluationTweetDf = pd.read_csv(DATA_PATH+'evaluation-round{}-tweet.csv'.format(ROUND), dtype={0: str}, header=None, names=['tweet_id'])
evaluationTweetDf = evaluationTweetDf.dropna()
evaluationTweetDf

# merge it with the political dataframe so that you can use the make predictions based on the variables
dfPolitical_test = dfPolitical.merge(evaluationTweetDf)

# define X as we did above in section (2.x.2. Separate X and y values)
X_pol = dfPolitical_test.select_dtypes(exclude=['object'])



In [89]:
#X = dfPoliticalAll_train[['num_political_entities','total_interactions','num_hashtags']]
Xtr_pol = dfPoliticalAll_train.select_dtypes(exclude=['object'])
ytr_pol = dfPoliticalAll_train['isPolitical'].apply(lambda x: 1 if x=='Yes' else 0)

In [90]:
X_binary = Xtr_pol[['is_verified','is_political_reply','is_political_retweet','user_protected','user_geo_enabled']]
X_numerical = Xtr_pol.drop(['is_verified','is_political_reply','is_political_retweet','user_protected','user_geo_enabled'], axis=1)

X_binary_t = X_pol[['is_verified','is_political_reply','is_political_retweet','user_protected','user_geo_enabled']]
X_numerical_t = X_pol.drop(['is_verified','is_political_reply','is_political_retweet','user_protected','user_geo_enabled'], axis=1)

In [91]:
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()
X_numerical = ss.fit_transform(X_numerical)
X_numerical_t = ss.fit_transform(X_numerical_t)

Xtr_pol = np.concatenate((X_binary, X_numerical), axis=1)
X_pol = np.concatenate((X_binary_t, X_numerical_t), axis=1)

In [92]:
Xtr_pol.shape

(3994, 32)

In [93]:
X_pol.shape

(5000, 32)

#### CFS

In [94]:
from sklearn.feature_selection import SelectKBest, mutual_info_classif

# create the feature selector
selector = SelectKBest(score_func=mutual_info_classif, k=10)

# fit the feature selector on the data
selector.fit(Xtr_pol, ytr_pol)

# get the selected features
Xtr_pol = selector.transform(Xtr_pol)
X_pol = selector.transform(X_pol)

In [95]:
Xtr_pol.shape

(3994, 10)

In [96]:
X_train, X_valid5, y_train, y_valid5 = train_test_split(Xtr_pol, ytr_pol, test_size=0.20, random_state=42)

In [97]:
X_pol.shape

(5000, 10)

In [98]:
from keras.models import Sequential
from keras.layers import Dense, Dropout

# create the model
model1 = Sequential()
model1.add(Dense(64, input_dim=10, activation='relu'))
model1.add(Dropout(0.2))
model1.add(Dense(32, activation='relu'))
model1.add(Dropout(0.2))
model1.add(Dense(16, activation='relu'))
model1.add(Dropout(0.2))
model1.add(Dense(1, activation='sigmoid'))

# compile the model
model1.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# train the model
model1.fit(Xtr_pol,ytr_pol, epochs=69, batch_size=64)

Epoch 1/69
Epoch 2/69
Epoch 3/69
Epoch 4/69
Epoch 5/69
Epoch 6/69
Epoch 7/69
Epoch 8/69
Epoch 9/69
Epoch 10/69
Epoch 11/69
Epoch 12/69
Epoch 13/69
Epoch 14/69
Epoch 15/69
Epoch 16/69
Epoch 17/69
Epoch 18/69
Epoch 19/69
Epoch 20/69
Epoch 21/69
Epoch 22/69
Epoch 23/69
Epoch 24/69
Epoch 25/69
Epoch 26/69
Epoch 27/69
Epoch 28/69
Epoch 29/69
Epoch 30/69
Epoch 31/69
Epoch 32/69
Epoch 33/69
Epoch 34/69
Epoch 35/69
Epoch 36/69
Epoch 37/69
Epoch 38/69
Epoch 39/69
Epoch 40/69
Epoch 41/69
Epoch 42/69
Epoch 43/69
Epoch 44/69
Epoch 45/69
Epoch 46/69
Epoch 47/69
Epoch 48/69
Epoch 49/69
Epoch 50/69
Epoch 51/69
Epoch 52/69
Epoch 53/69
Epoch 54/69
Epoch 55/69
Epoch 56/69
Epoch 57/69
Epoch 58/69
Epoch 59/69
Epoch 60/69
Epoch 61/69
Epoch 62/69
Epoch 63/69
Epoch 64/69
Epoch 65/69
Epoch 66/69
Epoch 67/69
Epoch 68/69
Epoch 69/69


<keras.callbacks.History at 0x1f248228d30>

In [99]:
# make predictions based on these variables
predictions_political = model1.predict(X_pol)



In [100]:
max(predictions_political)

array([0.98532045], dtype=float32)

In [101]:
predictions_political

array([[0.8681128 ],
       [0.09098553],
       [0.49456453],
       ...,
       [0.01340256],
       [0.8915803 ],
       [0.7660928 ]], dtype=float32)

In [102]:
min(predictions_political)

array([0.00189104], dtype=float32)

In [103]:
max(predictions_political)

array([0.98532045], dtype=float32)

### This part is important! We expect you to return your predictions in the following format:

In [104]:
modelPredTweet = dict([(x,float(y)) for x,y in zip([*dfPolitical_test.tweet_id], predictions_political)])
modelPredTweet

{'1593649159009099777': 0.8681128025054932,
 '1434787703783051264': 0.09098552912473679,
 '1427339600083791885': 0.4945645332336426,
 '1388235183653011462': 0.015061851590871811,
 '1579558096833511424': 0.8111414909362793,
 '1592120408073203712': 0.805652916431427,
 '1439547067337256967': 0.02052333950996399,
 '1577024342268837888': 0.8770454525947571,
 '1365726640416976906': 0.10726945847272873,
 '1597274845381029888': 0.8737208843231201,
 '1597512443328167936': 0.7868518829345703,
 '1415032260571680768': 0.2593359351158142,
 '1583477966373543936': 0.9250688552856445,
 '1564926450096013313': 0.3334864675998688,
 '1597138789108895744': 0.8804449439048767,
 '1384499047390658560': 0.48571860790252686,
 '1596583748669419521': 0.8398062586784363,
 '1391681495622995971': 0.05582781508564949,
 '1365710259549966339': 0.3042089343070984,
 '1590673118397624323': 0.057204313576221466,
 '1597256187325878273': 0.2615653872489929,
 '1588464678443024385': 0.8761496543884277,
 '1413108476348354562': 

## 3.2. Predictions for Users (Bot or Not)

In [105]:
evaluationUserDf = pd.read_csv(DATA_PATH+'evaluation-round{}-user.csv'.format(ROUND), dtype={0: str}, header=None, names=['user_screen_name'])
evaluationUserDf = evaluationUserDf.dropna()

# merge it with the political dataframe so that you can use the make predictions based on the variables
dfBot_test = dfBotAll.merge(evaluationUserDf)

# define X as we did above in section (2.x.2. Separate X and y values)
X_bot = dfBot_test.select_dtypes(exclude=['object'])



In [106]:
#X = dfBotAll_train[['description_len', 'followers_to_all_ratio', 'retweet_total_ratio', 'num_median_favorites']]
Xtr_bot = dfBotAll_train.select_dtypes(exclude=['object'])
ytr_bot = dfBotAll_train.isBot.apply(lambda x: 1 if x=='Yes' else 0)

In [107]:
X_binary = Xtr_bot[['user_verified','user_protected','user_geo_enabled']]
X_numerical = Xtr_bot.drop(['user_verified','user_protected','user_geo_enabled'], axis=1)

X_binary_t = X_bot[['user_verified','user_protected','user_geo_enabled']]
X_numerical_t = X_bot.drop(['user_verified','user_protected','user_geo_enabled'], axis=1)



In [108]:
from sklearn.preprocessing import StandardScaler

ss = StandardScaler()
X_numerical = ss.fit_transform(X_numerical)
Xtr_bot = np.concatenate((X_binary, X_numerical), axis=1)

X_numerical_t = ss.fit_transform(X_numerical_t)
X_bot = np.concatenate((X_binary_t, X_numerical_t), axis=1)

In [109]:
Xtr_bot.shape

(3909, 23)

In [110]:
X_bot.shape

(4999, 23)

#### CFS

In [111]:
from sklearn.feature_selection import SelectKBest, mutual_info_classif

# create the feature selector
selector = SelectKBest(score_func=mutual_info_classif, k=18)
# fit the feature selector on the data
selector.fit(Xtr_bot, ytr_bot)

# get the selected features
Xtr_bot = selector.transform(Xtr_bot)

X_bot = selector.transform(X_bot)

In [112]:
X_train, X_valid6, y_train, y_valid6 = train_test_split(Xtr_bot, ytr_bot, test_size=0.20, random_state=42)

In [113]:
Xtr_bot.shape

(3909, 18)

In [114]:
X_bot.shape

(4999, 18)

In [115]:
from keras.models import Sequential
from keras.layers import Dense, Dropout

# create the model
model2 = Sequential()
model2.add(Dense(64, input_dim=18, activation='relu'))
model2.add(Dropout(0.2))
model2.add(Dense(32, activation='relu'))
model2.add(Dropout(0.2))
model2.add(Dense(16, activation='relu'))
model2.add(Dropout(0.2))
model2.add(Dense(1, activation='sigmoid'))

# compile the model
model2.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# train the model
model2.fit(Xtr_bot,ytr_bot, epochs=69, batch_size=64)

Epoch 1/69
Epoch 2/69
Epoch 3/69
Epoch 4/69
Epoch 5/69
Epoch 6/69
Epoch 7/69
Epoch 8/69
Epoch 9/69
Epoch 10/69
Epoch 11/69
Epoch 12/69
Epoch 13/69
Epoch 14/69
Epoch 15/69
Epoch 16/69
Epoch 17/69
Epoch 18/69
Epoch 19/69
Epoch 20/69
Epoch 21/69
Epoch 22/69
Epoch 23/69
Epoch 24/69
Epoch 25/69
Epoch 26/69
Epoch 27/69
Epoch 28/69
Epoch 29/69
Epoch 30/69
Epoch 31/69
Epoch 32/69
Epoch 33/69
Epoch 34/69
Epoch 35/69
Epoch 36/69
Epoch 37/69
Epoch 38/69
Epoch 39/69
Epoch 40/69
Epoch 41/69
Epoch 42/69
Epoch 43/69
Epoch 44/69
Epoch 45/69
Epoch 46/69
Epoch 47/69
Epoch 48/69
Epoch 49/69
Epoch 50/69
Epoch 51/69
Epoch 52/69
Epoch 53/69
Epoch 54/69
Epoch 55/69
Epoch 56/69
Epoch 57/69
Epoch 58/69
Epoch 59/69
Epoch 60/69
Epoch 61/69
Epoch 62/69
Epoch 63/69
Epoch 64/69
Epoch 65/69
Epoch 66/69
Epoch 67/69
Epoch 68/69
Epoch 69/69


<keras.callbacks.History at 0x1f24a4da220>

In [116]:
predictions_bot = model2.predict(X_bot)



In [117]:
predictions_bot.shape

(4999, 1)

In [118]:
predictions_bot.sum()

1032.2997

In [119]:
max(predictions_bot)

array([0.9755126], dtype=float32)

In [120]:
min(predictions_bot)

array([2.0818535e-20], dtype=float32)

In [121]:
modelPredUser = dict([(x,float(y)) for x,y in zip([*dfBot_test.user_screen_name], predictions_bot)])
#modelPredUser

# PREPARE SUBMISSION

You will need to submit exact same file produced by using the following code. Any deviation from the desired format willbe marked as 0.

In [122]:
# Explain your approach

data_explanations = '''
To strengthen our training data, we concatenated all annotation files (from each group member) with the main training datasets.
Since the given prediction test dataset can not be verified (no labels provided), we divided our dataset into two subsets of 80% allocated for training and 20% set aside for testing, in order to evaluate the performance of our machine learning models.
Just Before the prediction, we trained our models with all of the labeled data we have and predicted to evaluation data.
'''

feature_explanations = '''
For feature extraction process, we discussed our annotation methodologies for both isPolitical and isBot parts.
Based on our common approaches, we selected the most useful features which were not included in the base template model.

created_at = the account's creation date.
since: it refers to the time passed from date that the account has been created 
user_geo_enabled : it is 1 if the user enabled his location.
user_verified : it refers to whether the account has verified by Twitter or not. 
count_digits : It refers to the amount of digits that exist in a username.
user_protected: It is 1 if the user has a private account.
user_followers_count: the number of followers that a user has.
user_friends_count: the number of people that the user is following.
description_len: the number of characters of the bio (description) of the user. 
user_tweet_count: the number of tweets that the user tweeted.
user_fav_count: the number of tweets that the user favorited.
average_tweet: the average amount of tweet that user tweeted since he joined to Twitter.
average_fav: the average amount of tweets that user favorited since he joined to Twitter.
followers_to_all_ratio: followers / (followers + friends)
is_reply: it indicates whether the tweet is a reply to another tweet.
replied_username: the username of the tweet which is replied.
tweet_length: the lenght of the tweet.
punctuation: the number of punctuation marks.

and additionally, we added more keywords to the political entities list.


Then, for round 2 we extracted more feature and at total isPolitical set has 32 and isBot set has 23 features.
We only used numerical and boolean variables. Numeric features are scaled with standardscaler.
Best 10 features are selected for political and 18 features are selected for bot.

'''

model_explanations = '''
In our modelling we had used following modelling methodologies and compared their Accuracy and MSE scores to determine best model.
GridSearch Cross Validation is applied to XGBoost, AdaBoost, DecisionTree to find best hyperparameter values.
Standard scaling is applied to all numerical values (normalization). In Round2, we tested voting classifier with
XGBoost, AdaBoost, LGBM, RandomForest and NN (Keras Binary Classifier) and obtained better results. However, we decided to
use NN directly since it can give us the highest MSE score due to its continuous outputs. We used NN for bot
classification tasks. Depending on the result of Round 2, we can change the political part into a NLP model.

Here our tested Models 
    Decision Tree
    Random Forest
    Logistic Regression
    XGBoost 
    AdaBoost 
    LGBM 
    NLP-BERT 
    KNN (K-Nearest Neighbors)
    Stacking Classifier
    Soft Voting Classifier
    Bagging Classifier
    Gaussian Naive Bayes 
    CNN
    NN
    
We are planning to utilize NPL-BERT model (NLP, encoded-base tranformer model-BERT) for isPolitical part in the next round,
and CNN for the isBot part.

'''

additional_explanations = '''
It seems that improving the quality and the quantity of the political entity list, increases the accuracy rate we have obtained significantly.
Also the adding the annotation files to the training set is something we have brain-stormed. We detected and generated many useful features
howerer as we add more and more feature to our training data, mse has increased. Therefore we decided to move on with bestfeatures
feature selection method. 
'''

In [123]:
ROUND = 2 # This project will have 3 rounds of predictions: 1,2,3
STUDENT_ID = '26772'#'<insert-your-id-here>'

In [124]:
predictions = {
    'round': ROUND,
    'student_id': STUDENT_ID,
    'user_predictions': modelPredUser,
    'tweet_predictions': modelPredTweet,
    'explanations': {
        'data': data_explanations,
        'feature': feature_explanations,
        'model': model_explanations,
        'other': additional_explanations,
    }
}


with open('predictions-{}_round{}.json'.format(STUDENT_ID, ROUND), 'w') as fl:
    fl.write(json.dumps(predictions, indent=4))

In [125]:
# Test your submission file

submission = json.load(open('predictions-{}_round{}.json'.format(STUDENT_ID, ROUND), 'r'))
submission

{'round': 2,
 'student_id': '26772',
 'user_predictions': {'nedenburdaysam': 0.0016010833205655217,
  'biologselim': 0.17713484168052673,
  'bilgin21604923': 0.05248625949025154,
  'denizlihabercom': 0.03314055874943733,
  'burakerbaychp': 0.0074861375615000725,
  'mustafaarst': 0.10217925161123276,
  'mvnez': 0.029669441282749176,
  'farukhalit2': 0.009846550412476063,
  'harlunoshi': 0.004284149967133999,
  'tamerduran_1': 0.16477464139461517,
  'donkisotumsu': 0.009929539635777473,
  'enveraysevera': 0.007127429824322462,
  'gendenmukatol': 0.0038403216749429703,
  '1905anason': 0.16777193546295166,
  'dasiskein': 0.09350070357322693,
  'ercan_bas29': 0.13088040053844452,
  'mett_1907': 0.010624295100569725,
  'haberinyokcokk': 0.02518833987414837,
  'han34nesli': 0.2610599100589752,
  'mehmetaltay64': 0.0029074496123939753,
  'nurtencam2': 0.04646916314959526,
  'berkeduranovic': 0.029155183583498,
  'cagdasadim': 0.01035254169255495,
  'yorumsuzadam87': 0.14933739602565765,
  'twi

In [126]:
preds = model2.predict(X_valid6)



In [127]:
# from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error
mse = mean_squared_error(y_valid6, preds)
mse

0.0874019753959514

In [128]:
preds2 = model1.predict(X_valid5)



In [129]:
# from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error
mse = mean_squared_error(y_valid5, preds2)
mse

0.14373696868524807