# 0. Initialize

## 0.1. Import Libraries

In [1]:
%matplotlib inline
%load_ext autoreload
%autoreload 2
import os, sys, glob
import gzip
import random
import tqdm
import json
import numpy as np
import pandas as pd
pd.set_option("display.max_columns", None)

from IPython import display
import matplotlib as mpl
from matplotlib import pyplot as plt

## 0.2. DEFINE VARIABLES 

In [2]:
DATA_PATH = '/Users/Gamegaraj/Desktop/CS412/data/' # '<insert-your-training-data-path-here>'

ROUND = 1 # This project will have 3 rounds of predictions: 1,2,3
STUDENT_ID = '26772'#'<insert-your-id-here>'
PROJECT_CODE = 'CS4129a709ea5dfc4'#'<insert-your-code-here>' # Same code for the annotation eg. CS412xxxxx

## 0.3. Read Training & Evaluation Data

### 0.3.1. Get the labels for tweets

In [3]:
#trainingTweetDf = pd.read_csv('{}training-tweet.csv'.format(DATA_PATH))
trainingTweetDf = pd.read_csv('{}training-tweet.csv'.format(DATA_PATH), dtype={'tweet_id': str, 'isPolitical': str})
trainingTweetDf

Unnamed: 0,tweet_id,isPolitical
0,1597170281545551872,Yes
1,1431700027471192069,No
2,1566035577090281472,Yes
3,1591538690869940225,Yes
4,1583898169238167554,Yes
...,...,...
2995,1593539327623151619,Yes
2996,1393886554062524418,No
2997,1597925615092764672,Yes
2998,1585291418616176640,Yes


In [4]:
trainingTweetDf.isPolitical.value_counts()

Yes    2003
No      997
Name: isPolitical, dtype: int64

### 0.3.2. Get the labels for users

In [5]:
trainingUserDf = pd.read_csv('{}training-user.csv'.format(DATA_PATH))
#trainingUserDf = pd.read_csv('training-user.csv')
trainingUserDf

Unnamed: 0,screen_name,isBot
0,koftecancaddy,No
1,ahaber,No
2,selahat03949652,No
3,erdin06357062,No
4,bhct__necatii,No
...,...,...
2995,djblumenberg,No
2996,mel1sq,No
2997,eren_yz1,Yes
2998,ergnyildiz4,No


### 0.3.3. Expand your dataset with metadata and tweets

In [6]:
# You can also expand training data by downloading your own labeled datasets following the link
# Download the documents under "Link to training data"

#print('http://www.onurvarol.com/Annotation-CS412-202201/reports/report_{}.html'.format(PROJECT_CODE))
print('http://www.onurvarol.com/Annotation-CS412-202201/reports/report_{}.html'.format('CS412aa4c69f55b37'))

http://www.onurvarol.com/Annotation-CS412-202201/reports/report_CS412aa4c69f55b37.html


### Add Annotation Data

In [7]:
codes = ['CS4129a709ea5dfc4','CS412a32e72d94b5f','CS412c255f188f1f1','CS4125691a2d1c16d','CS412aa4c69f55b37']

def mergeAnnotations(codes):
    df_users = pd.DataFrame()
    for i in codes:
        filename = 'annotated_users_{}.csv'.format(i)
        filename = ('{}'+filename)
        df = pd.read_csv(filename.format(DATA_PATH))
        df = df.drop(['url', 'isOrganizational', 'isTroll','gender'], axis=1)
        df = df.drop(df[df.isBot == 'Not sure'].index)
        df.rename(columns = {'Unnamed: 0':'screen_name'}, inplace = True)
        df.dropna(inplace=True)
        df_users = pd.concat([df,df_users])
    

    df_tweets = pd.DataFrame()
    for i in codes:
        filename = 'annotated_tweets_{}.csv'.format(i)
        filename = ('{}'+filename)
        df = pd.read_csv(filename.format(DATA_PATH))
        df = df.drop(['url', 'sentiment', 'isExperiential','isInsult','topics'], axis=1)
        df.rename(columns = {'Unnamed: 0':'tweet_id'}, inplace = True)
        df.dropna(inplace=True)
        df_tweets = pd.concat([df,df_tweets])  
        df_tweets['tweet_id'] = df_tweets['tweet_id'].astype(str)
    
    return df_users, df_tweets
        
        

In [8]:
df_users, df_tweets = mergeAnnotations(codes)

In [9]:
trainingUserDf = pd.concat([trainingUserDf,df_users],ignore_index=True)
trainingTweetDf = pd.concat([trainingTweetDf,df_tweets],ignore_index=True)

# 1. EXTRACT FEATURES
Under *1.1. Political Tweet Detection* and *1.2. Bot Detection*, we firstly collect raw data for processing. We then combine some of them (total_interactions = num_favorites + num_retweets) or use them to extract features (whether the tweet has one of the political entities @meralaksener, @kilicdarogluk etc.).

We expect you to collect more raw data from **tweet_metadata**, **user_profiles** and **user_tweets** files by creating a function as shown in below examples such as *check_if_retweet()* and using it while iterating over data as shown under *Merge Collected Features*.

We also expect you to create new variables as much as you can from the data in order to make your predictions more accurate. For example, you may want to check:

- The tweet sources that a user frequently uses
- Whether the user is a verified account or not

...

to assess whether **a user is a bot or not** and whether **a tweet is political or not**.

### Read json files

In [10]:
PATH_TO_DOWNLOADED = DATA_PATH # 'D:/Users/suuser/Desktop/Sabancı/CS412/spring-2022/project/'

## 1.1. Political Tweet Detection
This part stands for the feature extraction of tweets. We start with collecting the raw data from *tweet_metadata*, then use some of them to extract features.

### 1.1.1. Get Raw Data

#### 1.1.1.1. Check if Retweet

In [11]:
def check_if_retweet(tweet_metadata_line):
    is_retweet = 0
    retweeted_username = None

    try:
        tweet_metadata_line['retweeted_status']
        retweeted_username = tweet_metadata_line['retweeted_status']['user']['screen_name'].lower()
        is_retweet = 1

    except KeyError:
        pass

    return is_retweet, retweeted_username

#### 1.1.1.1. Check if Reply

In [12]:
def check_if_reply(tweet_metadata_line):
    is_reply = 0
    replied_username = None

    
    if(tweet_metadata_line['in_reply_to_screen_name'] != None ):
        replied_username = tweet_metadata_line['in_reply_to_screen_name'].lower()
        is_reply = 1



    return is_reply, replied_username

#### 1.1.1.1. Check if Verified

In [13]:
def check_if_verified(tweet_metadata_line):
     
    is_verified = tweet_metadata_line['user']['verified']  
    return is_verified

#### 1.1.1.2. Get Tweet Text

In [14]:
def get_tweet_text(tweet_metadata_line):
    text = tweet_metadata_line['text']
    
    return text

#### 1.1.1.3. Get Tweet ID

In [15]:
def get_tweet_id(tweet_metadata_line):
    id_str = tweet_metadata_line['id_str']
    
    return id_str

#### 1.1.1.4. Get Number of Mentions and Hashtags

In [16]:
def get_number_mentions_hashtags(tweet_metadata_line):
    num_mentions = len(tweet_metadata_line['entities']['user_mentions'])
    num_hashtags = len(tweet_metadata_line['entities']['hashtags'])

    return num_mentions, num_hashtags

#### 1.1.1.5. Get Number of Retweets and Favorites

In [17]:
def get_number_retweets_favorites(tweet_metadata_line):
    retweet_count = tweet_metadata_line['retweet_count']
    favorite_count = tweet_metadata_line['favorite_count']
    
    return retweet_count, favorite_count

#### 1.1.1.5. Get Number of Punctuations

In [18]:
def get_punc_num(text):
    punc = ['.', ',', '!', '?', ':', ';']
    count = 0
    for x in punc:
        if x in text:
            count += 1
    return count

#### 1.1.1.6. Get User Info

In [19]:
def get_user_info(tweet_metadata_line):
    id = tweet_metadata_line['user']['id_str']
    screen_name = tweet_metadata_line['user']['screen_name'].lower()
    description = tweet_metadata_line['user']['description']

    return id, screen_name, description

### 1.1.2. Derive Manually Crafted Features

#### 1.1.2.1. Check for political entity in text

 ['meral_aksener', 'kilicdarogluk', 'vekilince', 'RTErdogan', 'MevlutCavusoglu', 'umitozdag','vedatbilgn','SavciSayan','fahrettinaltun','erdogan','devlet bahçeli','bahçeli','ülkü'
                        'tcbestepe','fuatoktay','suleymansoylu','ikalin1','ekrem_imamoglu','mansuryavas06','pkk','recep','tayyip','erdoğan','cb','#ak','cumhuriyet'
                        'hdp','terörist','chpkk','kadro','ek','tcbestepe','MevlutCavusoglu', 'ozdag', 'özdağ', 'TBMM','drfahrettinkoca', 'yenisafak', 'tayyip', 'cumhur', 'belediye', 'baskan', 'başkan', 'ulusal',
                        'odatv', 'suleyman', 'haskologlu', 'mansur', 'dbdevletbahceli', 'Ahmet_Davutoglu', 'babacan', 'gazetesozcu', 'imamoglu', 'imamoğlu', 'parlament', 'meclis', 
                        'savaş', 'eğitim', 'egitim', 'dolar', 'lira', 'enflasyon', 'euro', 'döviz', 'altın', 'benzin', 'atama', 'altılı masa', 'abd', 'avrupa', 'almanya', 'nato',
                        'sınır', 'göçmen', 'gocmen', 'sığınmacı', 'mülteci', 'mahkeme', 'kanun', 'ukrayna', 'rusya', 'komisyon', 'fetö', 'faiz', 'piyasa', 'banka', 'politik', 'toplantı', 'çiftçi',
                        'saray', 'demokrasi', 'faşist', 'kemal', 'rejim', 'özgürlük', 'koalisyon', 'egemen', 'kurultay', 'danıştay', 'davutoğlu', 'birleşmiş milletler',
                        'bahçeli', 'diplomasi', 'cem uzan', 'lgbt', 'seçim', '2023', 'ibb', 'cemaat', 'soylu', 'liberal', 'kapital', 'protesto', 'halk','seçim', 
                        'erken seçim','sandık','icraat','gençlik kolu','akp','chp','mhp', 'kayyum','anayasa','mahkeme','nebati','tcmb','merkez bankası','kılışdar','cumhurbaşkanı adayı','millet','zillet'
                        'rte','atama','EYT','atanamıyor','kanun','soruşturma','sorusturma','altılı masa','bakanım','af','genel af', 'skandal','yolsuzluk','ihale','zafer','genelaf','muhalefet','muhalif','bakan'
                       'meral_aksener', 'kilicdarogluk', 'vekilince', 'RTErdogan', 'MevlutCavusoglu', 'umitozdag','vedatbilgn','SavciSayan','fahrettinaltun',
                        'tcbestepe','fuatoktay','suleymansoylu','ikalin1','ekrem_imamoglu','mansuryavas06','murat_kurum',
                       'MevlutCavusoglu', 'drfahrettinkoca', 'NureddinNebati','akaraismailoglu','DIBAliErbas','ismailcatakli',
                    'dbdevletbahceli', 'Ahmet_Davutoglu','deryayanikashb','suleymansoylu','06melihgokcek','mustafasentop','VahitKirisci',
                     'fuatoktay','Akparti','emineerdoğan','iletisim','bybekirbozdag','omerrcelik','kasapoglu']

In [20]:
def check_political_ent(text):
    
    # the list below can be modified and some new names may be added (or removed)
    list_of_entities  = ['meral_aksener', 'kilicdarogluk', 'vekilince', 'RTErdogan', 'MevlutCavusoglu', 'umitozdag','vedatbilgn','SavciSayan','fahrettinaltun','erdogan','devlet bahçeli','bahçeli','ülkü'
                        'tcbestepe','fuatoktay','suleymansoylu','ikalin1','ekrem_imamoglu','mansuryavas06','pkk','recep','tayyip','erdoğan','cb','#ak','cumhuriyet'
                        'hdp','terörist','chpkk','kadro','ek','tcbestepe','MevlutCavusoglu', 'ozdag', 'özdağ', 'TBMM','drfahrettinkoca', 'yenisafak', 'tayyip', 'cumhur', 'belediye', 'baskan', 'başkan', 'ulusal',
                        'odatv', 'suleyman', 'haskologlu', 'mansur', 'dbdevletbahceli', 'Ahmet_Davutoglu', 'babacan', 'gazetesozcu', 'imamoglu', 'imamoğlu', 'parlament', 'meclis', 
                        'savaş', 'eğitim', 'egitim', 'dolar', 'lira', 'enflasyon', 'euro', 'döviz', 'altın', 'benzin', 'atama', 'altılı masa', 'abd', 'avrupa', 'almanya', 'nato',
                        'sınır', 'göçmen', 'gocmen', 'sığınmacı', 'mülteci', 'mahkeme', 'kanun', 'ukrayna', 'rusya', 'komisyon', 'fetö', 'faiz', 'piyasa', 'banka', 'politik', 'toplantı', 'çiftçi',
                        'saray', 'demokrasi', 'faşist', 'kemal', 'rejim', 'özgürlük', 'koalisyon', 'egemen', 'kurultay', 'danıştay', 'davutoğlu', 'birleşmiş milletler',
                        'bahçeli', 'diplomasi', 'cem uzan', 'lgbt', 'seçim', '2023', 'ibb', 'cemaat', 'soylu', 'liberal', 'kapital', 'protesto', 'halk','seçim', 
                        'erken seçim','sandık','icraat','gençlik kolu','akp','chp','mhp', 'kayyum','anayasa','mahkeme','nebati','tcmb','merkez bankası','kılışdar','cumhurbaşkanı adayı','millet','zillet'
                        'rte','atama','EYT','atanamıyor','kanun','soruşturma','sorusturma','altılı masa','bakanım','af','genel af', 'skandal','yolsuzluk','ihale','zafer','genelaf','muhalefet','muhalif','bakan'
                       'meral_aksener', 'kilicdarogluk', 'vekilince', 'RTErdogan', 'MevlutCavusoglu', 'umitozdag','vedatbilgn','SavciSayan','fahrettinaltun',
                        'tcbestepe','fuatoktay','suleymansoylu','ikalin1','ekrem_imamoglu','mansuryavas06','murat_kurum',
                       'MevlutCavusoglu', 'drfahrettinkoca', 'NureddinNebati','akaraismailoglu','DIBAliErbas','ismailcatakli',
                    'dbdevletbahceli', 'Ahmet_Davutoglu','deryayanikashb','suleymansoylu','06melihgokcek','mustafasentop','VahitKirisci',
                     'fuatoktay','Akparti','emineerdoğan','iletisim','bybekirbozdag','omerrcelik','kasapoglu']
    entities_in_text = [ent for ent in list_of_entities if ent.lower() in text.lower()]
    number_entities = len(entities_in_text)

    return number_entities

#### 1.1.2.2. Number of total interactions

In [21]:
def total_interactions(retweet_count, favorite_count):
    total_num_interactions = retweet_count + favorite_count
    
    return total_num_interactions

### 1.1.2. Collect data using the functions above and transform into a Pandas DataFrame

In [22]:
dfPolitical = {'tweet_id':[],
              'is_retweet':[],
              'retweeted_username':[],
               'is_reply':[],
               'replied_username':[],
              'text':[],
              'num_mentions':[],
              'num_hashtags':[],
              'num_retweets':[],
              'num_favorites':[],
              'user_id':[],
              'user_screen_name':[],
              'user_description':[],
              'num_political_entities':[],
              'total_interactions':[],
              'punctuation': [],
              'tweet_length' : [],
              'is_verified': []}


with gzip.open(f"{PATH_TO_DOWNLOADED}tweet_metadata.jsons.gz", "rb") as f:
    for line in f:
        line = json.loads(line)
        
        # raw data:
        id_str = get_tweet_id(line)
        is_retweet, retweeted_username = check_if_retweet(line)
        is_reply, replied_username = check_if_reply(line)
        text = get_tweet_text(line)
        num_mentions, num_hashtags = get_number_mentions_hashtags(line)
        retweet_count, favorite_count = get_number_retweets_favorites(line)
        user_id_str, screen_name, user_description = get_user_info(line)
        punctuation = get_punc_num(text)
        is_verified = check_if_verified(line)
        tweet_length = len(text)
        # manually crafted data:
        num_political_entities = check_political_ent(text)
        total_num_interactions = total_interactions(retweet_count, favorite_count)

        dfPolitical['tweet_id'].append(id_str)
        dfPolitical['is_retweet'].append(is_retweet)
        dfPolitical['retweeted_username'].append(retweeted_username)
        dfPolitical['is_reply'].append(is_reply)
        dfPolitical['replied_username'].append(replied_username)
        dfPolitical['text'].append(text)
        dfPolitical['num_mentions'].append(num_mentions)
        dfPolitical['num_hashtags'].append(num_hashtags)
        dfPolitical['num_retweets'].append(retweet_count)
        dfPolitical['num_favorites'].append(favorite_count)
        dfPolitical['user_id'].append(user_id_str)
        dfPolitical['user_screen_name'].append(screen_name)
        dfPolitical['user_description'].append(user_description)
        dfPolitical['num_political_entities'].append(num_political_entities)
        dfPolitical['total_interactions'].append(total_num_interactions)
        dfPolitical['punctuation'].append(punctuation)
        dfPolitical['tweet_length'].append(tweet_length)
        dfPolitical['is_verified'].append(is_verified)

In [23]:
dfPolitical = pd.DataFrame(dfPolitical)
dfPolitical

Unnamed: 0,tweet_id,is_retweet,retweeted_username,is_reply,replied_username,text,num_mentions,num_hashtags,num_retweets,num_favorites,user_id,user_screen_name,user_description,num_political_entities,total_interactions,punctuation,tweet_length,is_verified
0,1588568792984346624,0,,0,,"Sosyal Hizmetin temelini çocuk oluşturur,çocuğ...",0,0,49,98,920963718103650304,maviruh_,shu/\nburaya afilli bir söz yazdığımı varsayın,1,147,3,140,False
1,1588452263047069697,0,,1,mahirunal,"@mahirunal Gavur İzmir ya onlar, hani Cumhuriy...",1,0,0,0,595514060,mtfdan,,1,0,2,97,False
2,1569589330544398336,0,,0,,#ŞehitAdayıUzmÇvşaKadro\nSiz İstesenizde Istem...,0,1,0,0,1356375754561490947,ahsucilginuzman,Vatan Sevdalisi,3,0,3,140,False
3,1570428119609139201,0,,1,ajans_muhbir,@ajans_muhbir Siz kaypak olmayıp onay vermesey...,1,0,0,0,1478775431008595968,hamitelkelle,HighOne,1,0,2,140,False
4,1551163840368414722,0,,0,,Engelli öğretmenler olarak önümüzdeki engeller...,0,0,0,0,1511976696337113088,sed58417690,,1,0,3,140,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
33528,1568595408233832448,0,,0,,Gerçek kimlik taşımayan hesaplara cevap vermem...,0,0,9,81,576247173,ardanzenturk,RT ONAYLADIĞIM ANLAMINA GELMEZ\nArtık fikirler...,1,90,3,140,False
33529,1584027427696959488,0,,1,umitozdag,@umitozdag Neden Suriyelilerle ilgili bu kadar...,1,0,1,8,162308585,ozgul_61,Bridge design engineer Yaay hesabı : dilfiruz,3,9,2,140,False
33530,1585945783307730945,0,,1,celebimehmeta,@celebimehmeta Niye Türkiye yüzyılıda.Türkiye ...,1,0,0,1,415025519,ladrekova,,1,1,2,76,False
33531,1569748909521801221,1,muazzezeralp,0,,RT @muazzezeralp: @Doan58213655 @denizkonur @N...,7,1,6,0,1442125177727307781,yapikytgrivrlsn,,6,6,1,140,False


## 1.2. From Users

### 1.2.1. Get user metadata from user_profiles.jsons.gz

#### 1.2.1.1. Get user info metadata

In [24]:
def get_user_info_metadata(user_metadata_line):
    
    user_id = user_metadata_line['id_str']
    user_name = user_metadata_line['name']
    user_screen_name = user_metadata_line['screen_name'].lower()
    user_location = user_metadata_line['location']
    user_description = user_metadata_line['description']
    user_followers_count = user_metadata_line['followers_count']
    user_friends_count = user_metadata_line['friends_count']
    user_created_at = user_metadata_line['created_at']
    user_protected = user_metadata_line['protected']
    user_verified = user_metadata_line['verified']
    user_geo_enabled = user_metadata_line['geo_enabled']
    user_tweet_count = user_metadata_line['statuses_count']
    user_fav_count = user_metadata_line['favourites_count']
    
    
    dictionary = {'user_id':user_id, 'user_name': user_name, 'user_screen_name':user_screen_name, 'user_location':user_location,
     'user_description':user_description, 'user_followers_count':user_followers_count, 'user_friends_count':user_friends_count, 
                  'user_created_at':user_created_at, 'user_protected':user_protected,
                  'user_verified': user_verified, 'user_geo_enabled':user_geo_enabled, 
                  'user_tweet_count':user_tweet_count, 'user_fav_count':user_fav_count}

    return dictionary

#### 1.2.1.2. Get followers/(followers+friends) ratio

In [25]:
def get_followers_all_ratio(user_followers_count, user_friends_count):
    
    if user_friends_count + user_followers_count == 0:
        followers_all_ratio = 0

    else:
        followers_all_ratio =  user_followers_count / (user_friends_count + user_followers_count)

    return followers_all_ratio

#### 1.2.1.3. Get description length

In [26]:
def get_desc_len(user_description):
    
    description_len = len(user_description)

    return description_len

#### 1.2.1.3. Get since

In [27]:
import time
from datetime import date
from datetime import datetime

#### 1.1.1.5. Get Number of Digits

In [28]:
def num_of_digits(username):
    count = 0

    for x in username:
        if x.isdigit():
            count+=1

    return count

In [29]:
def get_since(user_created_at):
    if user_created_at != None:
        creation_date = datetime.strptime(user_created_at ,'%a %b %d %H:%M:%S +0000 %Y')
        since = datetime.now() - creation_date
    else:
        since = 0
    return int(since.days)

In [30]:
dfBot = {'user_id':[],
         'user_name':[],
         'user_screen_name':[],
         'user_location':[],
         'user_description':[],
         'user_followers_count':[],
         'user_friends_count':[],
         'description_len':[],
         'followers_to_all_ratio':[],
        'user_created_at':[],
        'user_protected':[],
        'user_verified':[],
         'user_geo_enabled':[],
         'since':[],
         'user_tweet_count':[],
         'user_fav_count':[],
        'average_tweet': [],
        'average_fav': [],
        'count_digits': []}


with gzip.open(f"{PATH_TO_DOWNLOADED}user_profiles.jsons.gz", "rb") as f:
    for line in f:
        line = json.loads(line)

        dictionary = get_user_info_metadata(line)
        for k,v in dictionary.items():
            dfBot[k].append(v)

        
        # manually crafted data:
        description_len = get_desc_len(dictionary['user_description'])
        dfBot['description_len'].append(description_len)
        
        since = get_since(dictionary['user_created_at'])
        dfBot['since'].append(since)
        
        followers_all_ratio = get_followers_all_ratio(dictionary['user_followers_count'], 
                                                      dictionary['user_friends_count'])
        
        dfBot['average_tweet'].append(dictionary['user_tweet_count'] / since)
        dfBot['average_fav'].append(dictionary['user_fav_count'] / since)
        
        dfBot['count_digits'].append(num_of_digits(dictionary['user_screen_name']))

        dfBot['followers_to_all_ratio'].append(followers_all_ratio)

In [31]:
dfBot = pd.DataFrame(dfBot)
dfBot

Unnamed: 0,user_id,user_name,user_screen_name,user_location,user_description,user_followers_count,user_friends_count,description_len,followers_to_all_ratio,user_created_at,user_protected,user_verified,user_geo_enabled,since,user_tweet_count,user_fav_count,average_tweet,average_fav,count_digits
0,1431241870848450577,Nasreena Khan Wazir,nasreenakhan006,"Islamabad, Pakistan",Student,65,185,7,0.260000,Fri Aug 27 13:07:30 +0000 2021,False,False,True,515,2551,17676,4.953398,34.322330,3
1,1304340303080386560,fania :((((,scorpiehoez,bogor,have a holly jolly🎄,8235,3011,19,0.732260,Fri Sep 11 08:45:44 +0000 2020,False,False,True,865,42771,15474,49.446243,17.889017,0
2,1116042038577958914,Yusuf Aksoy,yusufak63712920,,"Bir şeyden pişmanlık duymak istemiyorsan,her ş...",95,399,64,0.192308,Wed Apr 10 18:15:31 +0000 2019,False,False,False,1384,14300,18220,10.332370,13.164740,8
3,4859899931,Be (VIXX6) ama oppalarının düğününe gidemiyor,nedenburdaysam,Hufflepuff ortak salon,"SMStan\n/St☆rlight ///come on girls,this is ou...",40,83,65,0.325203,Fri Jan 29 11:01:25 +0000 2016,False,False,False,2552,21303,26999,8.347571,10.579545,0
4,2225373636,SLMDMR,biologselim,,BİYOLOG🔬🦠\nNanoteknoloji,100,98,23,0.505051,Sun Dec 01 18:16:41 +0000 2013,False,False,False,3340,1629,2179,0.487725,0.652395,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
29665,1320834618220781569,islammm,islam_mrsj,,,229,217,0,0.513453,Mon Oct 26 21:08:22 +0000 2020,False,False,False,819,2396,10820,2.925519,13.211233,0
29666,111074128,Melda Onur,meldaonur,,"Şekersiz çay, etsiz sofra, SAVAŞSIZ dünya... 🐌...",212457,5428,100,0.975088,Wed Feb 03 18:39:01 +0000 2010,False,False,True,4737,75178,36671,15.870382,7.741398,0
29667,36946875,ali ydm,ali_ydm,"İstanbul, Türkiye",hayat oyunsa bende jeton çok,116,141,28,0.451362,Fri May 01 13:56:23 +0000 2009,False,False,False,5015,6482,7389,1.292522,1.473380,0
29668,2389587396,Türkan Usta,turkanusta,"Ankara, Türkiye",Ustaya sormuşlar; hayatta yaptığın en büyük is...,1669,3639,116,0.314431,Fri Mar 14 18:05:09 +0000 2014,False,False,True,3237,121113,140095,37.415199,43.279271,0


### 1.2.2. Get Tweet Info of Users in user_profiles.jsons.gz

#### 1.2.2.1. Check ratio of retweets to all tweets

In [32]:
def get_retweet_tweet_ratio(line):
    number_retweets = 0
    number_original_tweets = 0

    for tweet in line['tweets']:
        try:
            tweet['retweeted_status']
            number_retweets += 1
                
        except:
            number_original_tweets += 1
            
    total_tweets = number_retweets + number_original_tweets
    
    if total_tweets == 0:
        retweet_total_ratio = None
    else:
        retweet_total_ratio = number_retweets/(total_tweets)
    
    return retweet_total_ratio

#### 1.2.2.2. Check median number of favorites

In [33]:
def get_median_number_favorites(line):
    num_median_favorites = np.median([tweet['favorite_count'] for tweet in line['tweets']])

    return num_median_favorites

### 1.2.3. Collect data using the functions above and transform into a Pandas DataFrame

In [34]:
dfBotTweets = {'user_id':[],
               'retweet_total_ratio':[],
               'num_median_favorites':[],
               'num_of_tweets':[]
              }

i = 0

with gzip.open(f"{PATH_TO_DOWNLOADED}user_tweets.jsons.gz", "rb") as f:
    for line in f:

        line = json.loads(line)

        user_id = line['user_id']
        dfBotTweets['user_id'].append(user_id)
        
        retweet_total_ratio = get_retweet_tweet_ratio(line)
        dfBotTweets['retweet_total_ratio'].append(retweet_total_ratio)
        
        num_median_favorites = get_median_number_favorites(line)
        dfBotTweets['num_median_favorites'].append(num_median_favorites)
        
        dfBotTweets['num_of_tweets'].append(len(line['tweets']))

        i += 1
        if i % 1000 == 0:
            print(i)

  return _methods._mean(a, axis=axis, dtype=dtype,
  ret = ret.dtype.type(ret / rcount)


1000
2000
3000
4000
5000
6000
7000
8000
9000
10000
11000
12000
13000
14000
15000
16000
17000
18000
19000
20000
21000
22000
23000
24000
25000
26000
27000
28000


In [35]:
dfBotTweets = pd.DataFrame(dfBotTweets)
dfBotTweets

Unnamed: 0,user_id,retweet_total_ratio,num_median_favorites,num_of_tweets
0,594642154,0.115000,2.0,200
1,525600289,0.005025,1.0,199
2,931895965501534209,0.900000,0.0,200
3,1591543462746329088,0.185000,0.0,200
4,734801354749796352,1.000000,0.0,200
...,...,...,...,...
28310,1591370361488252928,0.800000,0.0,200
28311,1475272459616235525,0.825000,0.0,200
28312,1096753792731750401,0.051020,1.0,196
28313,1269527617687953409,0.095000,2.0,200


In [36]:
dfBotTweets.describe()

Unnamed: 0,retweet_total_ratio,num_median_favorites,num_of_tweets
count,28232.0,28232.0,28315.0
mean,0.359703,9.76369,189.178739
std,0.351173,215.705174,36.092954
min,0.0,0.0,0.0
25%,0.035,0.0,198.0
50%,0.226131,0.0,200.0
75%,0.675127,1.0,200.0
max,1.0,25659.0,200.0


### 1.2.3. Merge dfBot and dfBotTweets

In [37]:
dfBotTweets[[ 'num_median_favorites','retweet_total_ratio']] = dfBotTweets[[ 'num_median_favorites','retweet_total_ratio']].fillna(0)

In [38]:
dfBotAll = dfBot.merge(dfBotTweets, how='left')

In [39]:
dfBotAll.isna().sum()

user_id                      0
user_name                    0
user_screen_name             0
user_location                0
user_description             0
user_followers_count         0
user_friends_count           0
description_len              0
followers_to_all_ratio       0
user_created_at              0
user_protected               0
user_verified                0
user_geo_enabled             0
since                        0
user_tweet_count             0
user_fav_count               0
average_tweet                0
average_fav                  0
count_digits                 0
retweet_total_ratio       1355
num_median_favorites      1355
num_of_tweets             1355
dtype: int64

In [40]:
dfBotAll[['retweet_total_ratio', 'num_median_favorites','num_of_tweets']] = dfBotAll[['retweet_total_ratio', 'num_median_favorites','num_of_tweets']].fillna(0)

dfBotAll[['user_protected', 'user_verified','user_geo_enabled']] = dfBotAll[['user_protected', 'user_verified','user_geo_enabled']].astype(int)

# 2. TRAIN MODEL

## 2.1. Political Tweet Prediction

### 2.1.1. Merge dfPolitical data with labels

In [41]:
dfPoliticalAll_train = dfPolitical.merge(trainingTweetDf,on='tweet_id')

In [42]:
dfPoliticalAll_train.select_dtypes('bool')
dfPoliticalAll_train['is_verified'] = dfPoliticalAll_train['is_verified'].astype(int)

### 2.1.2. Separate X and y values
We only use 3 features here to create a baseline model. However, it is not enough to get good results.

In [43]:
dfPoliticalAll_train.select_dtypes(exclude=['object']).info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3994 entries, 0 to 3993
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype
---  ------                  --------------  -----
 0   is_retweet              3994 non-null   int64
 1   is_reply                3994 non-null   int64
 2   num_mentions            3994 non-null   int64
 3   num_hashtags            3994 non-null   int64
 4   num_retweets            3994 non-null   int64
 5   num_favorites           3994 non-null   int64
 6   num_political_entities  3994 non-null   int64
 7   total_interactions      3994 non-null   int64
 8   punctuation             3994 non-null   int64
 9   tweet_length            3994 non-null   int64
 10  is_verified             3994 non-null   int32
dtypes: int32(1), int64(10)
memory usage: 358.8 KB


In [44]:
#X = dfPoliticalAll_train[['num_political_entities','total_interactions','num_hashtags']]
#y = dfPoliticalAll_train['isPolitical'].apply(lambda x: 1 if x=='Yes' else 0)

X = dfPoliticalAll_train.select_dtypes(exclude=['object'])
#X = dfPoliticalAll_train[['is_reply', 'num_mentions', 'num_hashtags', 'num_retweets',
                        #  'num_favorites', 'num_political_entities', 'punctuation', 'is_verified']]
y = dfPoliticalAll_train['isPolitical'].apply(lambda x: 1 if x=='Yes' else 0)

In [45]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3994 entries, 0 to 3993
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype
---  ------                  --------------  -----
 0   is_retweet              3994 non-null   int64
 1   is_reply                3994 non-null   int64
 2   num_mentions            3994 non-null   int64
 3   num_hashtags            3994 non-null   int64
 4   num_retweets            3994 non-null   int64
 5   num_favorites           3994 non-null   int64
 6   num_political_entities  3994 non-null   int64
 7   total_interactions      3994 non-null   int64
 8   punctuation             3994 non-null   int64
 9   tweet_length            3994 non-null   int64
 10  is_verified             3994 non-null   int32
dtypes: int32(1), int64(10)
memory usage: 358.8 KB


In [46]:
from sklearn.preprocessing import StandardScaler
num_cols = X.select_dtypes(include=np.number).columns.tolist()

ss = StandardScaler()
X = pd.DataFrame(ss.fit_transform(X[num_cols]), columns=num_cols)

In [47]:
from sklearn.feature_selection import RFE
from sklearn.linear_model import LogisticRegression

# Initialize the model
lr = LogisticRegression()

# Perform RFE and select the top 15 features
rfe = RFE(lr, n_features_to_select=8)
X_new = rfe.fit_transform(X, y)

# Print the features that have been selected
print("Selected features:", [X.columns[i] for i in range(len(X.columns)) if rfe.support_[i]])

Selected features: ['is_reply', 'num_mentions', 'num_retweets', 'num_favorites', 'num_political_entities', 'total_interactions', 'punctuation', 'tweet_length']


In [48]:
X = dfPoliticalAll_train.select_dtypes(exclude=['object'])
#X = dfPoliticalAll_train[ ['is_reply', 'num_mentions', 'num_retweets', 'num_favorites', 'num_political_entities', 'total_interactions', 'punctuation', 'tweet_length']]
y = dfPoliticalAll_train['isPolitical'].apply(lambda x: 1 if x=='Yes' else 0)

In [49]:
X.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3994 entries, 0 to 3993
Data columns (total 11 columns):
 #   Column                  Non-Null Count  Dtype
---  ------                  --------------  -----
 0   is_retweet              3994 non-null   int64
 1   is_reply                3994 non-null   int64
 2   num_mentions            3994 non-null   int64
 3   num_hashtags            3994 non-null   int64
 4   num_retweets            3994 non-null   int64
 5   num_favorites           3994 non-null   int64
 6   num_political_entities  3994 non-null   int64
 7   total_interactions      3994 non-null   int64
 8   punctuation             3994 non-null   int64
 9   tweet_length            3994 non-null   int64
 10  is_verified             3994 non-null   int32
dtypes: int32(1), int64(10)
memory usage: 358.8 KB


In [86]:
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import SMOTE

# Initialize SMOTE
sm = SMOTE(random_state=42)

# Perform oversampling
X_resampled, y_resampled = sm.fit_resample(X, y)

# Split the data into training and testing sets
X_train, X_valid, y_train, y_valid = train_test_split(X_resampled, y_resampled, test_size=0.2, random_state=42)

In [87]:
from sklearn.ensemble import AdaBoostClassifier, RandomForestClassifier
from xgboost import XGBClassifier
from lightgbm import LGBMClassifier
from sklearn.ensemble import VotingClassifier

# Initialize the models
ada = AdaBoostClassifier(n_estimators=100, learning_rate=0.1)
xgb = XGBClassifier(n_estimators=100, max_depth=5, learning_rate=0.1)
lgb = LGBMClassifier(n_estimators=100, max_depth=5, learning_rate=0.1)
rf = RandomForestClassifier(n_estimators=100, max_depth=5)

# Create the ensemble model
ensemble = VotingClassifier(estimators=[('ada', ada), ('xgb', xgb), ('lgb', lgb), ('rf', rf)], voting='hard')

# Fit the ensemble model on the training data
ensemble.fit(X_train, y_train)

# Make predictions on the test data
y_pred = ensemble.predict(X_valid)

# Print the accuracy
print("Accuracy:", ensemble.score(X_valid, y_valid))

Accuracy: 0.8127009646302251


In [88]:
X_resampled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6220 entries, 0 to 6219
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   user_followers_count    6220 non-null   float64
 1   user_friends_count      6220 non-null   float64
 2   description_len         6220 non-null   float64
 3   followers_to_all_ratio  6220 non-null   float64
 4   user_protected          6220 non-null   float64
 5   user_verified           6220 non-null   float64
 6   user_geo_enabled        6220 non-null   float64
 7   since                   6220 non-null   float64
 8   user_tweet_count        6220 non-null   float64
 9   user_fav_count          6220 non-null   float64
 10  average_tweet           6220 non-null   float64
 11  average_fav             6220 non-null   float64
 12  count_digits            6220 non-null   float64
 13  retweet_total_ratio     6220 non-null   float64
 14  num_median_favorites    6220 non-null   

### 2.1.3. Train - validation split

In [89]:
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.20, random_state=42)

### 2.1.4. Train the model

Here, you may use different models such as neural networks, XGBoost, AdaBoost, RandomForest, Linear Regression, Logistic Regression etc. to see which model does the best. Also, you can use grid_search_cv() or a basic for loop to optimize the hyperparameters of your model.

In [98]:
from sklearn.feature_selection import SelectKBest
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split



# Feature selection
# Select the top 10 best features
selector = SelectKBest(k=9)
X_new = selector.fit_transform(X, y)

# Split the data into training and test sets
X_train, X_valid, y_train, y_valid = train_test_split(X_new, y, test_size=0.2)

# Neural network
clf = MLPClassifier(hidden_layer_sizes=(50,), max_iter=10000, random_state=0)
clf.fit(X_train, y_train)

# Evaluation
accuracy = clf.score(X_test, y_test)
print("Accuracy:", accuracy)

Accuracy: 0.7468030690537084


In [99]:
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, accuracy_score, recall_score, precision_score, f1_score
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import StackingClassifier
from sklearn.ensemble import VotingClassifier

In [100]:
# Import Library
from sklearn.linear_model import LogisticRegression  
from sklearn.model_selection import cross_val_score
from sklearn.metrics import confusion_matrix, classification_report, roc_curve, roc_auc_score, precision_recall_curve
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error

In [101]:
LR= LogisticRegression(max_iter=300).fit(X_train, y_train)
# predicting
preds = LR.predict(X_valid)

# classification report
target_names = ['bot', 'not bot']
print('Classification_Report:')
print(classification_report(y_valid, preds, digits=4, target_names = target_names))

Classification_Report:
              precision    recall  f1-score   support

         bot     0.8363    0.9728    0.8994       625
     not bot     0.6909    0.2420    0.3585       157

    accuracy                         0.8261       782
   macro avg     0.7636    0.6074    0.6289       782
weighted avg     0.8071    0.8261    0.7908       782



In [102]:
acc_score = accuracy_score(y_valid, preds)
confusion = confusion_matrix(y_valid, preds)
mse = mean_squared_error(y_valid, preds)

print("MSE:", mse, "\n",
      "Accuracy Score:", acc_score, "\n",
      "Confusion Matrix:", "\n", confusion)

MSE: 0.17391304347826086 
 Accuracy Score: 0.8260869565217391 
 Confusion Matrix: 
 [[608  17]
 [119  38]]


In [103]:
from sklearn.feature_selection import SelectFromModel
from lightgbm import LGBMClassifier

lgbc_p=LGBMClassifier(n_estimators=500, learning_rate=0.05, num_leaves=32, colsample_bytree=0.2,
            reg_alpha=3, reg_lambda=1, min_split_gain=0.01, min_child_weight=40)


lgbc_p.fit(X_train, y_train)
preds = lgbc_p.predict(X_valid)

In [104]:
acc_score = accuracy_score(y_valid, preds)
confusion = confusion_matrix(y_valid, preds)
mse = mean_squared_error(y_valid, preds)

print("MSE:", mse, "\n",
      "Accuracy Score:", acc_score, "\n",
      "Confusion Matrix:", "\n", confusion)

MSE: 0.17519181585677748 
 Accuracy Score: 0.8248081841432225 
 Confusion Matrix: 
 [[600  25]
 [112  45]]


In [105]:
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier
from sklearn.neighbors import KNeighborsClassifier
from xgboost import XGBClassifier

In [106]:
adb_ros = AdaBoostClassifier(n_estimators = 100)
adb_ros.fit(X_train, y_train)

#predicting
preds = adb_ros.predict(X_valid)


In [107]:
acc_score = accuracy_score(y_valid, preds)
confusion = confusion_matrix(y_valid, preds)
mse = mean_squared_error(y_valid, preds)

print("MSE:", mse, "\n",
      "Accuracy Score:", acc_score, "\n",
      "Confusion Matrix:", "\n", confusion)

MSE: 0.17774936061381075 
 Accuracy Score: 0.8222506393861893 
 Confusion Matrix: 
 [[594  31]
 [108  49]]


from sklearn.model_selection import GridSearchCV

rfc=RandomForestClassifier(random_state=42)
param_grid = { 
    'n_estimators': [200, 500],
    'max_features': ['auto', 'sqrt', 'log2'],
    'max_depth' : [4,5,6,7,8],
    'criterion' :['gini', 'entropy']
}
CV_rfc = GridSearchCV(estimator=rfc, param_grid=param_grid, cv= 5)
CV_rfc.fit(X_train, y_train)


CV_rfc.best_params_

In [108]:
rf_ros_p = RandomForestClassifier(max_depth=20, n_estimators=3000)
rf_ros_p.fit(X_train, y_train)

#predicting
preds = rf_ros_p.predict(X_valid)

In [109]:
acc_score = accuracy_score(y_valid, preds)
confusion = confusion_matrix(y_valid, preds)
mse = mean_squared_error(y_valid, preds)

print("MSE:", mse, "\n",
      "Accuracy Score:", acc_score, "\n",
      "Confusion Matrix:", "\n", confusion)

MSE: 0.1636828644501279 
 Accuracy Score: 0.8363171355498721 
 Confusion Matrix: 
 [[604  21]
 [107  50]]


In [110]:
rf_ros_b = RandomForestClassifier(criterion= 'gini',
 max_depth= 20,
 max_features ='auto',
 n_estimators = 800)
rf_ros_b.fit(X_train, y_train)

#predicting
preds = rf_ros_b.predict(X_valid)

  warn(


In [111]:
acc_score = accuracy_score(y_valid, preds)
confusion = confusion_matrix(y_valid, preds)
mse = mean_squared_error(y_valid, preds)

print("MSE:", mse, "\n",
      "Accuracy Score:", acc_score, "\n",
      "Confusion Matrix:", "\n", confusion)

MSE: 0.16751918158567775 
 Accuracy Score: 0.8324808184143222 
 Confusion Matrix: 
 [[604  21]
 [110  47]]


from sklearn import preprocessing
from sklearn.cross_validation import train_test_split


from sklearn.grid_search import GridSearchCV

estimator = XGBClassifier(
    objective= 'binary:logistic',
    nthread=4,
    seed=42
)

xgb_model = XGBClassifier()
parameters = {
    'max_depth': range (2, 10, 1),
    'n_estimators': range(60, 220, 40),
    'learning_rate': [0.1, 0.01, 0.05]
}

grid_search = GridSearchCV(
    estimator=estimator,
    param_grid=parameters,
    scoring = 'roc_auc',
    n_jobs = 10,
    cv = 10,
    verbose=True
)

grid_search.fit(X_train, y_train)

grid_search.best_params_

In [112]:
from xgboost import XGBClassifier
xgb_ros_p = XGBClassifier(learning_rate= 0.1, max_depth =  4, n_estimators =  60)
xgb_ros_p.fit(X_train, y_train)

#predicting
preds = xgb_ros_p.predict(X_valid)


In [113]:
acc_score = accuracy_score(y_valid, preds)
confusion = confusion_matrix(y_valid, preds)
mse = mean_squared_error(y_valid, preds)

print("MSE:", mse, "\n",
      "Accuracy Score:", acc_score, "\n",
      "Confusion Matrix:", "\n", confusion)

MSE: 0.17135549872122763 
 Accuracy Score: 0.8286445012787724 
 Confusion Matrix: 
 [[601  24]
 [110  47]]


In [114]:
import keras.utils
from tensorflow.keras import utils as np_utils
from tensorflow.keras.utils import to_categorical
from keras.models import Sequential
from keras.layers import Dense, Dropout, Activation, Flatten
from tensorflow.keras.optimizers import SGD, Adam
import tensorflow as tf
model_1 = tf.keras.Sequential()
model_1.add(tf.keras.layers.Dense(100, activation='relu', name='hidden_layer_1'))
model_1.add(tf.keras.layers.Dense(50, activation='relu', name='hidden_layer_2'))
model_1.add(tf.keras.layers.Dense(25, activation='relu', name='hidden_layer_3'))
model_1.add(tf.keras.layers.Dense(1, name='output_layer'))

# compile your model with an optimizer
model_1.compile(loss='mean_squared_error', optimizer = Adam(learning_rate=0.001), metrics=['mse','accuracy'])


model_1.fit(X_train, y_train, epochs=50, batch_size=64, shuffle=True, verbose=1)

Epoch 1/50
Epoch 2/50
Epoch 3/50
Epoch 4/50
Epoch 5/50
Epoch 6/50
Epoch 7/50
Epoch 8/50
Epoch 9/50
Epoch 10/50
Epoch 11/50
Epoch 12/50
Epoch 13/50
Epoch 14/50
Epoch 15/50
Epoch 16/50
Epoch 17/50
Epoch 18/50
Epoch 19/50
Epoch 20/50
Epoch 21/50
Epoch 22/50
Epoch 23/50
Epoch 24/50
Epoch 25/50
Epoch 26/50
Epoch 27/50
Epoch 28/50
Epoch 29/50
Epoch 30/50
Epoch 31/50
Epoch 32/50
Epoch 33/50
Epoch 34/50
Epoch 35/50
Epoch 36/50
Epoch 37/50
Epoch 38/50
Epoch 39/50
Epoch 40/50
Epoch 41/50
Epoch 42/50
Epoch 43/50
Epoch 44/50
Epoch 45/50
Epoch 46/50
Epoch 47/50
Epoch 48/50
Epoch 49/50
Epoch 50/50


<keras.callbacks.History at 0x24ae06fef70>

preds = model_1.predict(X_valid)

preds

acc_score = accuracy_score(y_valid, preds)
confusion = confusion_matrix(y_valid, preds)
mse = mean_squared_error(y_valid, preds)

print("MSE:", mse, "\n",
      "Accuracy Score:", acc_score, "\n",
      "Confusion Matrix:", "\n", confusion)

In [116]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error

# create an instance
dtc_political = DecisionTreeClassifier()
param_grid = {'criterion':['gini','entropy'], 'max_depth':[3,5,7,9,11]}
grid = GridSearchCV(dtc_political,
                    param_grid, 
                    cv=28,
                    scoring='precision',
                    return_train_score=False, 
                    verbose=1, 
                    refit=True)


# fit your model
grid_search = grid.fit(X_train, y_train)

# make predictions
preds = grid_search.predict(X_valid)

# evaluate on validation set
acc_score = accuracy_score(y_valid, preds)
confusion = confusion_matrix(y_valid, preds)
mse = mean_squared_error(y_valid, preds)

print("MSE:", mse, "\n",
      "Accuracy Score:", acc_score, "\n",
      "Confusion Matrix:", "\n", confusion)

Fitting 28 folds for each of 10 candidates, totalling 280 fits


  _warn_prf(average, modifier, msg_start, len(result))
  _warn_prf(average, modifier, msg_start, len(result))


MSE: 0.20076726342711 
 Accuracy Score: 0.7992327365728901 
 Confusion Matrix: 
 [[592  33]
 [124  33]]


from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error

# create an instance
dtc_political = DecisionTreeClassifier()

# fit your model
dtc_political.fit(X_train, y_train)

# make predictions
preds = dtc_political.predict(X_valid)

# evaluate on validation set
acc_score = accuracy_score(y_valid, preds)
confusion = confusion_matrix(y_valid, preds)
mse = mean_squared_error(y_valid, preds)

print("MSE:", mse, "\n",
      "Accuracy Score:", acc_score, "\n",
      "Confusion Matrix:", "\n", confusion)

## 2.2. Bot Detection

### 2.2.1. Merge dfBotAll data with labels

In [117]:
dfBotAll.user_screen_name = dfBotAll.user_screen_name.str.lower()

In [118]:
dfBotAll_train = dfBotAll.merge(trainingUserDf,
                               left_on='user_screen_name',
                               right_on='screen_name')

dfBotAll_train

Unnamed: 0,user_id,user_name,user_screen_name,user_location,user_description,user_followers_count,user_friends_count,description_len,followers_to_all_ratio,user_created_at,user_protected,user_verified,user_geo_enabled,since,user_tweet_count,user_fav_count,average_tweet,average_fav,count_digits,retweet_total_ratio,num_median_favorites,num_of_tweets,screen_name,isBot
0,1431241870848450577,Nasreena Khan Wazir,nasreenakhan006,"Islamabad, Pakistan",Student,65,185,7,0.260000,Fri Aug 27 13:07:30 +0000 2021,0,0,1,515,2551,17676,4.953398,34.322330,3,0.395939,0.0,197.0,nasreenakhan006,No
1,1431241870848450577,Nasreena Khan Wazir,nasreenakhan006,"Islamabad, Pakistan",Student,65,185,7,0.260000,Fri Aug 27 13:07:30 +0000 2021,0,0,1,515,2551,17676,4.953398,34.322330,3,0.395939,0.0,197.0,nasreenakhan006,No
2,1304340303080386560,fania :((((,scorpiehoez,bogor,have a holly jolly🎄,8235,3011,19,0.732260,Fri Sep 11 08:45:44 +0000 2020,0,0,1,865,42771,15474,49.446243,17.889017,0,0.125000,0.0,200.0,scorpiehoez,No
3,1304340303080386560,fania :((((,scorpiehoez,bogor,have a holly jolly🎄,8235,3011,19,0.732260,Fri Sep 11 08:45:44 +0000 2020,0,0,1,865,42771,15474,49.446243,17.889017,0,0.125000,0.0,200.0,scorpiehoez,No
4,1304340303080386560,fania :((((,scorpiehoez,bogor,have a holly jolly🎄,8235,3011,19,0.732260,Fri Sep 11 08:45:44 +0000 2020,0,0,1,865,42771,15474,49.446243,17.889017,0,0.125000,0.0,200.0,scorpiehoez,No
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3904,1286770207134973954,Hamide Arabacı,anka6054,,,151,61,0,0.712264,Fri Jul 24 21:08:34 +0000 2020,0,0,0,913,5288,5347,5.791895,5.856517,4,0.000000,1.0,200.0,anka6054,No
3905,1598032338323214338,atamabekleyenbahceci,atamabekleyenzz,,,173,367,0,0.320370,Wed Nov 30 19:13:03 +0000 2022,0,0,0,54,311,196,5.759259,3.629630,0,0.580000,0.0,200.0,atamabekleyenzz,No
3906,760235343966863360,Emrah İNCİ,memrahinci,Istanbul - Bayburt,Researcher | Middle East | Political Science |...,5863,5905,71,0.498215,Mon Aug 01 22:06:45 +0000 2016,0,0,1,2366,1029,725,0.434911,0.306424,0,0.040000,36.0,200.0,memrahinci,No
3907,1553973684100124672,Murat Kkk,muratkkk18,,Normal sıradan bir insanım,1,10,26,0.090909,Mon Aug 01 05:19:56 +0000 2022,0,0,0,176,18,38,0.102273,0.215909,2,0.769231,0.0,13.0,muratkkk18,No


### 2.2.2. Separate X and y values
We use only 4 features here to create a baseline model. However, it is not enough to get good results.

In [119]:
X = dfBotAll_train.select_dtypes(exclude=['object'])
y = dfBotAll_train.isBot.apply(lambda x: 1 if x=='Yes' else 0)

In [120]:
dfBotAll_train.select_dtypes(exclude=['object']).info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3909 entries, 0 to 3908
Data columns (total 16 columns):
 #   Column                  Non-Null Count  Dtype  
---  ------                  --------------  -----  
 0   user_followers_count    3909 non-null   int64  
 1   user_friends_count      3909 non-null   int64  
 2   description_len         3909 non-null   int64  
 3   followers_to_all_ratio  3909 non-null   float64
 4   user_protected          3909 non-null   int32  
 5   user_verified           3909 non-null   int32  
 6   user_geo_enabled        3909 non-null   int32  
 7   since                   3909 non-null   int64  
 8   user_tweet_count        3909 non-null   int64  
 9   user_fav_count          3909 non-null   int64  
 10  average_tweet           3909 non-null   float64
 11  average_fav             3909 non-null   float64
 12  count_digits            3909 non-null   int64  
 13  retweet_total_ratio     3909 non-null   float64
 14  num_median_favorites    3909 non-null   

In [121]:
from sklearn.preprocessing import StandardScaler
num_cols = X.select_dtypes(include=np.number).columns.tolist()

ss = StandardScaler()
X = pd.DataFrame(ss.fit_transform(X[num_cols]), columns=num_cols)

### 2.2.3. Train-test split

In [122]:
X_train.shape

(3127, 9)

In [123]:
import numpy as np
from keras.models import Sequential
from keras.layers import Dense, LSTM
from sklearn.model_selection import train_test_split

# Load the account features
account_features = X # list of account features (e.g., number of followers, account age, etc.)
labels = y # list of labels (bot or not)

#Split the data into train and test sets
X_train, X_test, y_train, y_test = train_test_split(account_features, labels, test_size=0.2)

# Define the model
model = Sequential()
model.add(LSTM(units=32, input_shape=(account_features.shape[1],1)))
model.add(Dense(1, activation='sigmoid'))

# Compile the model
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])

# Reshape the data for LSTM


# Train the model
model.fit(X_train, y_train, batch_size=32, epochs=30)

# Evaluate the model on the test set
score, acc = model.evaluate(X_test, y_test, batch_size=32)
print('Test score:', score)
print('Test accuracy:', acc)

Epoch 1/30
Epoch 2/30
Epoch 3/30
Epoch 4/30
Epoch 5/30
Epoch 6/30
Epoch 7/30
Epoch 8/30
Epoch 9/30
Epoch 10/30
Epoch 11/30
Epoch 12/30
Epoch 13/30
Epoch 14/30
Epoch 15/30
Epoch 16/30
Epoch 17/30
Epoch 18/30
Epoch 19/30
Epoch 20/30
Epoch 21/30
Epoch 22/30
Epoch 23/30
Epoch 24/30
Epoch 25/30
Epoch 26/30
Epoch 27/30
Epoch 28/30
Epoch 29/30
Epoch 30/30
Test score: 0.4460867643356323
Test accuracy: 0.7992327213287354


In [124]:
from sklearn.model_selection import train_test_split

X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size=0.20, random_state=42)

In [125]:
LR= LogisticRegression(max_iter=300).fit(X_train, y_train)
# predicting
preds = LR.predict(X_valid)

# classification report
target_names = ['bot', 'not bot']
print('Classification_Report:')
print(classification_report(y_valid, preds, digits=4, target_names = target_names))

Classification_Report:
              precision    recall  f1-score   support

         bot     0.8254    0.9760    0.8944       625
     not bot     0.6512    0.1783    0.2800       157

    accuracy                         0.8159       782
   macro avg     0.7383    0.5772    0.5872       782
weighted avg     0.7905    0.8159    0.7711       782



In [126]:
acc_score = accuracy_score(y_valid, preds)
confusion = confusion_matrix(y_valid, preds)
mse = mean_squared_error(y_valid, preds)

print("MSE:", mse, "\n",
      "Accuracy Score:", acc_score, "\n",
      "Confusion Matrix:", "\n", confusion)

MSE: 0.18414322250639387 
 Accuracy Score: 0.8158567774936062 
 Confusion Matrix: 
 [[610  15]
 [129  28]]


In [127]:
adb_ros = AdaBoostClassifier(n_estimators = 250)
adb_ros.fit(X_train, y_train)

#predicting
preds = adb_ros.predict(X_valid)


In [128]:
acc_score = accuracy_score(y_valid, preds)
confusion = confusion_matrix(y_valid, preds)
mse = mean_squared_error(y_valid, preds)

print("MSE:", mse, "\n",
      "Accuracy Score:", acc_score, "\n",
      "Confusion Matrix:", "\n", confusion)

MSE: 0.17647058823529413 
 Accuracy Score: 0.8235294117647058 
 Confusion Matrix: 
 [[602  23]
 [115  42]]


In [129]:
rf_ros_b = RandomForestClassifier(criterion= 'entropy',
 max_depth= 14,
 max_features ='auto',
 n_estimators = 800)
rf_ros_b.fit(X_train, y_train)

#predicting
preds = rf_ros_b.predict(X_valid)

  warn(


In [130]:
acc_score = accuracy_score(y_valid, preds)
confusion = confusion_matrix(y_valid, preds)
mse = mean_squared_error(y_valid, preds)

print("MSE:", mse, "\n",
      "Accuracy Score:", acc_score, "\n",
      "Confusion Matrix:", "\n", confusion)

MSE: 0.1636828644501279 
 Accuracy Score: 0.8363171355498721 
 Confusion Matrix: 
 [[601  24]
 [104  53]]


In [131]:
from xgboost import XGBClassifier
xgb_ros_b = XGBClassifier(learning_rate= 0.1, max_depth =  17, n_estimators =  95)
xgb_ros_b.fit(X_train, y_train)

#predicting
preds = xgb_ros_b.predict(X_valid)


In [132]:
acc_score = accuracy_score(y_valid, preds)
confusion = confusion_matrix(y_valid, preds)
mse = mean_squared_error(y_valid, preds)

print("MSE:", mse, "\n",
      "Accuracy Score:", acc_score, "\n",
      "Confusion Matrix:", "\n", confusion)

MSE: 0.16879795396419436 
 Accuracy Score: 0.8312020460358056 
 Confusion Matrix: 
 [[593  32]
 [100  57]]


In [133]:
from sklearn.feature_selection import SelectFromModel
from lightgbm import LGBMClassifier

lgbc_b=LGBMClassifier(n_estimators=500, learning_rate=0.06, num_leaves=64, colsample_bytree=0.2,
            reg_alpha=3, reg_lambda=1, min_split_gain=0.01, min_child_weight=40)


lgbc_b.fit(X_train, y_train)
preds = lgbc_b.predict(X_valid)

In [134]:
acc_score = accuracy_score(y_valid, preds)
confusion = confusion_matrix(y_valid, preds)
mse = mean_squared_error(y_valid, preds)

print("MSE:", mse, "\n",
      "Accuracy Score:", acc_score, "\n",
      "Confusion Matrix:", "\n", confusion)

MSE: 0.17263427109974425 
 Accuracy Score: 0.8273657289002557 
 Confusion Matrix: 
 [[602  23]
 [112  45]]


In [135]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error

# create an instance
dtc_political = DecisionTreeClassifier()
param_grid = {'criterion':['gini','entropy'], 'max_depth':[3,5,7,9,11]}
grid = GridSearchCV(dtc_political,
                    param_grid, 
                    cv=28,
                    scoring='precision',
                    return_train_score=False, 
                    verbose=1, 
                    refit=True)


# fit your model
grid_search = grid.fit(X_train, y_train)

# make predictions
preds = grid_search.predict(X_valid)

# evaluate on validation set
acc_score = accuracy_score(y_valid, preds)
confusion = confusion_matrix(y_valid, preds)
mse = mean_squared_error(y_valid, preds)

print("MSE:", mse, "\n",
      "Accuracy Score:", acc_score, "\n",
      "Confusion Matrix:", "\n", confusion)

Fitting 28 folds for each of 10 candidates, totalling 280 fits
MSE: 0.18797953964194372 
 Accuracy Score: 0.8120204603580563 
 Confusion Matrix: 
 [[611  14]
 [133  24]]


### 2.2.4. Train the model

In [136]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, mean_squared_error

# create an instance
dtc_bot = DecisionTreeClassifier()

# fit your model
dtc_bot.fit(X_train, y_train)

# make predictions
preds = dtc_bot.predict(X_valid)

# evaluate on validation set
acc_score = accuracy_score(y_valid, preds)
confusion = confusion_matrix(y_valid, preds)

print("MSE:", mse, "\n",
      "Accuracy Score:", acc_score, "\n",
      "Confusion Matrix:", "\n", confusion)

MSE: 0.18797953964194372 
 Accuracy Score: 0.7838874680306905 
 Confusion Matrix: 
 [[550  75]
 [ 94  63]]


# 3. MAKE PREDICTIONS

Here, you will make predictions with the models that you have trained above.

## 3.1. Predictions for Tweets (Political or Not)

In [137]:
#X = dfPoliticalAll_train[['num_political_entities','total_interactions','num_hashtags']]
#y = dfPoliticalAll_train['isPolitical'].apply(lambda x: 1 if x=='Yes' else 0)

X_train = dfPoliticalAll_train.select_dtypes(exclude=['object'])
#X = dfPoliticalAll_train[['is_reply', 'num_mentions', 'num_hashtags', 'num_retweets',
 #                         'num_favorites', 'num_political_entities', 'punctuation', 'is_verified']]
y_train = dfPoliticalAll_train['isPolitical'].apply(lambda x: 1 if x=='Yes' else 0)

In [138]:
X_train.shape

(3994, 11)

In [139]:
from sklearn.preprocessing import StandardScaler
num_cols = X_train.select_dtypes(include=np.number).columns.tolist()

ss = StandardScaler()
X_train = pd.DataFrame(ss.fit_transform(X_train[num_cols]), columns=num_cols)

In [140]:
X_train

Unnamed: 0,is_retweet,is_reply,num_mentions,num_hashtags,num_retweets,num_favorites,num_political_entities,total_interactions,punctuation,tweet_length,is_verified
0,0.0,-1.752528,-0.908933,-0.294296,3.708557,0.896067,-0.515844,1.226528,1.372829,0.973589,-0.135492
1,0.0,-1.752528,-0.908933,-0.294296,3.708557,0.896067,-0.515844,1.226528,1.372829,0.973589,-0.135492
2,0.0,-1.752528,-0.908933,-0.294296,3.708557,0.896067,-0.515844,1.226528,1.372829,0.973589,-0.135492
3,0.0,-1.752528,-0.908933,-0.294296,3.708557,0.896067,-0.515844,1.226528,1.372829,0.973589,-0.135492
4,0.0,-1.752528,-0.908933,-0.294296,3.708557,0.896067,-0.515844,1.226528,1.372829,0.973589,-0.135492
...,...,...,...,...,...,...,...,...,...,...,...
3989,0.0,0.570604,-0.208107,-0.294296,-0.113063,-0.073192,-0.090342,-0.078573,-1.396869,-1.765145,-0.135492
3990,0.0,-1.752528,1.894370,-0.294296,-0.113063,-0.073192,0.335160,-0.078573,0.449596,0.857541,-0.135492
3991,0.0,0.570604,1.193545,-0.294296,-0.113063,-0.073192,1.186164,-0.078573,1.372829,0.973589,-0.135492
3992,0.0,0.570604,-0.208107,-0.294296,-0.113063,-0.073192,-0.941346,-0.078573,-0.473636,-1.347372,-0.135492


In [141]:
xgb_ros_p.fit(X_train, y_train)

In [142]:
X.shape

(3909, 16)

In [143]:
# read the evaluation file as follows

evaluationTweetDf = pd.read_csv(DATA_PATH+'evaluation-round{}-tweet.csv'.format(ROUND), dtype={0: str}, header=None, names=['tweet_id'])
evaluationTweetDf = evaluationTweetDf.dropna()
evaluationTweetDf

# merge it with the political dataframe so that you can use the make predictions based on the variables
dfPolitical_test = dfPolitical.merge(evaluationTweetDf)

# define X as we did above in section (2.x.2. Separate X and y values)
X = dfPolitical_test.select_dtypes(exclude=['object'])
ss = StandardScaler()
X = pd.DataFrame(ss.fit_transform(X[num_cols]), columns=num_cols)
# make predictions based on these variables
predictions_political = xgb_ros_p.predict(X)

In [144]:
predictions_political

array([1, 0, 1, ..., 0, 1, 1])

### This part is important! We expect you to return your predictions in the following format:

In [145]:
modelPredTweet = dict([(x,float(y)) for x,y in zip([*dfPolitical_test.tweet_id], predictions_political)])
modelPredTweet

{'1593649159009099777': 1.0,
 '1367571642604544000': 0.0,
 '1589993032975544320': 1.0,
 '1565312596135354373': 1.0,
 '1388235183653011462': 0.0,
 '1592120408073203712': 1.0,
 '1439547067337256967': 0.0,
 '1597274845381029888': 1.0,
 '1586021183958704128': 1.0,
 '1356926480605982728': 1.0,
 '1595357036925026306': 1.0,
 '1585766233491886081': 1.0,
 '1595871258985615361': 1.0,
 '1352635736537882629': 1.0,
 '1583477966373543936': 1.0,
 '1564926450096013313': 1.0,
 '1585634359612420101': 1.0,
 '1384499047390658560': 0.0,
 '1596583748669419521': 0.0,
 '1391681495622995971': 0.0,
 '1365710259549966339': 0.0,
 '1590673118397624323': 0.0,
 '1389951943343316995': 0.0,
 '1407921226656280580': 0.0,
 '1452348722810138646': 1.0,
 '1597256187325878273': 0.0,
 '1595829502021623812': 1.0,
 '1579408398894137344': 1.0,
 '1570758749606019073': 1.0,
 '1366091745772077058': 0.0,
 '1596233602886701057': 1.0,
 '1584922292127256577': 0.0,
 '1586279180983042050': 1.0,
 '1399687111234756612': 0.0,
 '136322656468

## 3.2. Predictions for Users (Bot or Not)

In [146]:
X_train = dfBotAll_train.select_dtypes(exclude=['object'])
y_train = dfBotAll_train.isBot.apply(lambda x: 1 if x=='Yes' else 0)

In [147]:
from sklearn.preprocessing import StandardScaler
num_cols = X_train.select_dtypes(include=np.number).columns.tolist()

ss = StandardScaler()
X_train = pd.DataFrame(ss.fit_transform(X_train[num_cols]), columns=num_cols)

In [148]:
X_train.shape

(3909, 16)

In [149]:
X_train

Unnamed: 0,user_followers_count,user_friends_count,description_len,followers_to_all_ratio,user_protected,user_verified,user_geo_enabled,since,user_tweet_count,user_fav_count,average_tweet,average_fav,count_digits,retweet_total_ratio,num_median_favorites,num_of_tweets
0,-0.106669,-0.181144,-0.772688,-0.907499,-0.150874,-0.16203,1.326873,-0.911696,-0.312182,-0.261804,-0.368987,0.105576,0.565100,0.116444,-0.075984,0.256128
1,-0.106669,-0.181144,-0.772688,-0.907499,-0.150874,-0.16203,1.326873,-0.911696,-0.312182,-0.261804,-0.368987,0.105576,0.565100,0.116444,-0.075984,0.256128
2,-0.047002,0.035492,-0.524441,0.908195,-0.150874,-0.16203,1.326873,-0.676057,0.207421,-0.288974,0.720860,-0.201688,-0.633693,-0.647979,-0.075984,0.323903
3,-0.047002,0.035492,-0.524441,0.908195,-0.150874,-0.16203,1.326873,-0.676057,0.207421,-0.288974,0.720860,-0.201688,-0.633693,-0.647979,-0.075984,0.323903
4,-0.047002,0.035492,-0.524441,0.908195,-0.150874,-0.16203,1.326873,-0.676057,0.207421,-0.288974,0.720860,-0.201688,-0.633693,-0.647979,-0.075984,0.323903
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
3904,-0.106041,-0.190649,-0.917498,0.831316,-0.150874,-0.16203,-0.753651,-0.643740,-0.276823,-0.413928,-0.348448,-0.426667,0.964698,-1.000652,-0.067039,0.323903
3905,-0.105880,-0.167192,-0.917498,-0.675394,-0.150874,-0.16203,-0.753651,-1.222067,-0.341121,-0.477485,-0.349248,-0.468304,-0.633693,0.635751,-0.075984,0.323903
3906,-0.064325,0.257340,0.551294,0.008365,-0.150874,-0.16203,1.326873,0.334499,-0.331845,-0.470958,-0.479667,-0.530440,-0.633693,-0.887797,0.246019,0.323903
3907,-0.107136,-0.194559,-0.379631,-1.557601,-0.150874,-0.16203,-0.753651,-1.139929,-0.344906,-0.479435,-0.487815,-0.532133,0.165502,1.169643,-0.075984,-3.900772


In [150]:
adb_ros.fit(X_train,y_train)

In [151]:
evaluationUserDf = pd.read_csv(DATA_PATH+'evaluation-round{}-user.csv'.format(ROUND), dtype={0: str}, header=None, names=['user_screen_name'])
evaluationUserDf = evaluationUserDf.dropna()

# merge it with the political dataframe so that you can use the make predictions based on the variables
dfBot_test = dfBotAll.merge(evaluationUserDf)

# define X as we did above in section (2.x.2. Separate X and y values)
X = dfBot_test.select_dtypes(exclude=['object'])
ss = StandardScaler()
X = pd.DataFrame(ss.fit_transform(X[num_cols]), columns=num_cols)
# make predictions based on these variables
predictions_bot = adb_ros.predict(X)

In [152]:
X.shape

(5000, 16)

In [153]:
modelPredUser = dict([(x,float(y)) for x,y in zip([*dfBot_test.user_screen_name], predictions_bot)])
modelPredUser

{'nedenburdaysam': 0.0,
 'biologselim': 0.0,
 'alaraaynncnm': 0.0,
 '_sydneycarton_': 0.0,
 'denizlihabercom': 0.0,
 'burakerbaychp': 0.0,
 'mustafaarst': 0.0,
 'mvnez': 0.0,
 'qara118': 0.0,
 'alpar_kaan': 0.0,
 'farukhalit2': 0.0,
 'haf_zhan': 0.0,
 'harlunoshi': 0.0,
 'heritagepaix': 0.0,
 '37baho37': 0.0,
 'tamerduran_1': 0.0,
 'donkisotumsu': 0.0,
 'nuranwolf': 0.0,
 'politikgundem': 0.0,
 'isakethudax': 0.0,
 'ilaydejaneiro': 0.0,
 'gendenmukatol': 0.0,
 '1905anason': 0.0,
 'eraydurgut03': 0.0,
 'dasiskein': 0.0,
 'mett_1907': 0.0,
 'semihyeteer': 0.0,
 'haberinyokcokk': 0.0,
 'meleky_ozaydin': 0.0,
 'han34nesli': 0.0,
 'bilobi4': 0.0,
 'berkeduranovic': 0.0,
 'cagdasadim': 0.0,
 'merabalare': 0.0,
 'sevdaac72373936': 1.0,
 '21gramlife1': 0.0,
 'cakan0_': 0.0,
 'oguzksalici': 0.0,
 'emre_caliskann': 0.0,
 'mehmet07454846': 0.0,
 'lifegs': 0.0,
 'begumkarabeyx': 0.0,
 'avutulan': 0.0,
 'imamgibiimam': 0.0,
 'durdane52': 0.0,
 'radyotrafik35': 0.0,
 'rk_ozanyali': 0.0,
 'uykusuz75'

# PREPARE SUBMISSION

You will need to submit exact same file produced by using the following code. Any deviation from the desired format willbe marked as 0.

In [154]:
# Explain your approach

data_explanations = '''
To strengthen our training data, we concatenated all annotation files (from each group member) with the main training datasets.
Since the given prediction test dataset can not be verified (no labels provided), we divided our dataset into two subsets of 80% allocated for training and 20% set aside for testing, in order to evaluate the performance of our machine learning models.
'''

feature_explanations = '''
For feature extraction process, we discussed our annotation methodologies for both isPolitical and isBot parts.
Based on our common approaches, we selected the most useful features which were not included in the base template model.

created_at = the account's creation date.
since: it refers to the time passed from date that the account has been created 
user_geo_enabled : it is 1 if the user enabled his location.
user_verified : it refers to whether the account has verified by Twitter or not. 
count_digits : It refers to the amount of digits that exist in a username.
user_protected: It is 1 if the user has a private account.
user_followers_count: the number of followers that a user has.
user_friends_count: the number of people that the user is following.
description_len: the number of characters of the bio (description) of the user. 
user_tweet_count: the number of tweets that the user tweeted.
user_fav_count: the number of tweets that the user favorited.
average_tweet: the average amount of tweet that user tweeted since he joined to Twitter.
average_fav: the average amount of tweets that user favorited since he joined to Twitter.
followers_to_all_ratio: followers / (followers + friends)
is_reply: it indicates whether the tweet is a reply to another tweet.
replied_username: the username of the tweet which is replied.
tweet_length: the lenght of the tweet.
punctuation: the number of punctuation marks.

and additionally, we added more keywords to the political entities list.
'''

model_explanations = '''
In our modelling we had used following modelling methodologies and compared their Accuracy and MSE scores to determine best model.
GridSearch Cross Validation is applied to XGBoost, AdaBoost, DecisionTree to find best hyperparameter values.
Standard scaling is applied to all numerical values (normalization).


Here our tested Models 
    Decision Tree Accuracy: MSE:
    Random Forest Accuracy: MSE:
    Logistic Regression Accuracy: MSE:
    XGBoost Accuracy: MSE:
    AdaBoost Accuracy: MSE:
    LGBM Accuracy: MSE:
    NLP-BERT Accuracy: 0.85 MSE: 0.154
    KNN (K-Nearest Neighbors)
    Stacking Classifier
    Soft Voting Classifier
    Gaussian Naive Bayes 
    
We are planning to utilize NPL-BERT model (NLP, encoded-base tranformer model-BERT) for isPolitical part in the next round,
and CNN for the isBot part.


For Bot Test We used 

For Political Test We used 
'''

additional_explanations = '''
It seems that improving the quality and the quantity of the political entity list, increases the accuracy rate we have obtained significantly.
Also the adding the annotation files to the training set is something we have brain-stormed.
'''

In [155]:
ROUND = 1
STUDENT_ID = 26772


In [156]:
import json
 
# Opening JSON file
with open(DATA_PATH+'dict.json') as json_file:
    data = json.load(json_file)

In [157]:
data

{'1593649159009099777': 1.0,
 '1367571642604544000': 0.0,
 '1589993032975544320': 1.0,
 '1565312596135354373': 1.0,
 '1388235183653011462': 0.0,
 '1592120408073203712': 1.0,
 '1439547067337256967': 0.0,
 '1597274845381029888': 1.0,
 '1586021183958704128': 1.0,
 '1356926480605982728': 0.0,
 '1595357036925026306': 1.0,
 '1585766233491886081': 1.0,
 '1595871258985615361': 1.0,
 '1352635736537882629': 0.0,
 '1583477966373543936': 1.0,
 '1564926450096013313': 0.0,
 '1585634359612420101': 1.0,
 '1384499047390658560': 0.0,
 '1596583748669419521': 0.0,
 '1391681495622995971': 0.0,
 '1365710259549966339': 0.0,
 '1590673118397624323': 1.0,
 '1389951943343316995': 0.0,
 '1407921226656280580': 0.0,
 '1452348722810138646': 1.0,
 '1597256187325878273': 1.0,
 '1595829502021623812': 1.0,
 '1579408398894137344': 1.0,
 '1570758749606019073': 1.0,
 '1366091745772077058': 0.0,
 '1596233602886701057': 1.0,
 '1584922292127256577': 1.0,
 '1586279180983042050': 1.0,
 '1399687111234756612': 0.0,
 '136322656468

In [158]:
predictions = {
    'round': ROUND,
    'student_id': STUDENT_ID,
    'user_predictions': modelPredUser,
    'tweet_predictions': data,
    'explanations': {
        'data': data_explanations,
        'feature': feature_explanations,
        'model': model_explanations,
        'other': additional_explanations,
    }
}


with open('predictions-{}_round{}.json'.format(STUDENT_ID, ROUND), 'w') as fl:
    fl.write(json.dumps(predictions, indent=4))

In [159]:
# Test your submission file

submission = json.load(open('predictions-{}_round{}.json'.format(STUDENT_ID, ROUND), 'r'))
submission

{'round': 1,
 'student_id': 26772,
 'user_predictions': {'nedenburdaysam': 0.0,
  'biologselim': 0.0,
  'alaraaynncnm': 0.0,
  '_sydneycarton_': 0.0,
  'denizlihabercom': 0.0,
  'burakerbaychp': 0.0,
  'mustafaarst': 0.0,
  'mvnez': 0.0,
  'qara118': 0.0,
  'alpar_kaan': 0.0,
  'farukhalit2': 0.0,
  'haf_zhan': 0.0,
  'harlunoshi': 0.0,
  'heritagepaix': 0.0,
  '37baho37': 0.0,
  'tamerduran_1': 0.0,
  'donkisotumsu': 0.0,
  'nuranwolf': 0.0,
  'politikgundem': 0.0,
  'isakethudax': 0.0,
  'ilaydejaneiro': 0.0,
  'gendenmukatol': 0.0,
  '1905anason': 0.0,
  'eraydurgut03': 0.0,
  'dasiskein': 0.0,
  'mett_1907': 0.0,
  'semihyeteer': 0.0,
  'haberinyokcokk': 0.0,
  'meleky_ozaydin': 0.0,
  'han34nesli': 0.0,
  'bilobi4': 0.0,
  'berkeduranovic': 0.0,
  'cagdasadim': 0.0,
  'merabalare': 0.0,
  'sevdaac72373936': 1.0,
  '21gramlife1': 0.0,
  'cakan0_': 0.0,
  'oguzksalici': 0.0,
  'emre_caliskann': 0.0,
  'mehmet07454846': 0.0,
  'lifegs': 0.0,
  'begumkarabeyx': 0.0,
  'avutulan': 0.0,