In [1]:
import pandas as pd
import numpy as np
import math
import time

from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import FunctionTransformer
from sklearn.decomposition import PCA
from sklearn.neighbors import KNeighborsClassifier

from nltk.lm.preprocessing import padded_everygram_pipeline, pad_both_ends
from nltk.util import trigrams
from nltk.lm import MLE

import warnings
warnings.filterwarnings('ignore')

In [2]:
df_train = pd.read_csv('../data/set-3/train/profile_info.csv').iloc[:, 1:]
df_label = pd.read_csv('../data/set-3/train/label.csv').iloc[:, 1:]
df_train = df_train.merge(df_label, on='ID')
df_train.head()

Unnamed: 0,ID,name,screen_name,location,description,url,protected,followers_count,friends_count,listed_count,...,profile_image_url_https,profile_link_color,profile_sidebar_border_color,profile_sidebar_fill_color,profile_text_color,profile_use_background_image,has_extended_profile,default_profile,default_profile_image,label
0,17461978,SHAQ,SHAQ,"Orlando, FL","VERY QUOTATIOUS, I PERFORM RANDOM ACTS OF SHAQ...",http://www.ShaqFuRadio.com,False,15349596,692,45568,...,https://pbs.twimg.com/profile_images/167390727...,2FC2EF,181A1E,252429,666666,True,False,False,False,0
1,1297437077403885568,Jennifer Fishpaw,JenniferFishpaw,,,,False,0,44,0,...,https://pbs.twimg.com/profile_images/129743740...,1DA1F2,C0DEED,DDEEF6,333333,True,True,True,False,1
2,17685258,Brad Parscale,parscale,Florida,Owner @ Parscale Strategy. Senior Advisor Digi...,http://www.parscale.com,False,762839,475,3201,...,https://pbs.twimg.com/profile_images/129545322...,AB2316,FFFFFF,FFFFFF,666666,False,False,False,False,0
3,15750898,FOX 13 Tampa Bay,FOX13News,"Tampa, FL",Bringing you the important stuff like breaking...,http://www.FOX13news.com,False,327587,4801,1744,...,https://pbs.twimg.com/profile_images/129319301...,0B2F8A,FFFFFF,E8EEF0,333333,True,False,False,False,0
4,1659167666,Vonte The Plug 🎤🔌,VonteThePlugNC,"Jacksonville Beach, FL",MOTIVATION 3 OUT NOW 🔥 Singles: ‘Lil Shawdy’ &...,https://music.apple.com/us/artist/vonte-the-pl...,False,13324,647,44,...,https://pbs.twimg.com/profile_images/118166240...,1DA1F2,C0DEED,DDEEF6,333333,True,False,True,False,1


In [3]:
def get_screen_name_likelihood(series):
    sequence = series.apply(lambda x: list(x.lower())).values.tolist()
    return get_likelihood_array(sequence)
    
def get_likelihood_array(sequence, n=3):
    train_data, padded_sent = padded_everygram_pipeline(n, sequence)
    mle = MLE(n)
    mle.fit(train_data, padded_sent)
    
    s = np.zeros((len(sequence),))
    for i, name in enumerate(sequence):
        tri = trigrams(pad_both_ends(name, n=3))
        total_score = 1
        count = 0
        for ele in tri:
            score = mle.score(ele[2], [ele[0], ele[1]])
            total_score *= score
            count += 1
        s[i] = total_score ** (1/count)
    return s

def feature_engineering(df):
    used_columns = [
        'statuses_count',
        'followers_count',
        'friends_count',
        'favourites_count',
        'listed_count',
        'default_profile',
        'profile_use_background_image',
        'verified',
        'label'
    ]
    df_return = df[used_columns]
    age = (pd.to_datetime('2020-12-31 00:00:00') - pd.to_datetime(df['created_at']).dt.tz_localize(None)) / np.timedelta64(1, 'Y')
    df_return['tweet_freq'] = df['statuses_count'] / age
    df_return['followers_growth_rate'] = df['followers_count'] / age
    df_return['friends_growth_rate'] = df['friends_count'] / age
    df_return['favourites_growth_rate'] = df['favourites_count'] / age
    df_return['listed_growth_rate'] = df['listed_count'] / age
    df_return['followers_friends_ratio'] = df['followers_count'] / np.maximum(df['friends_count'], 1)
    df_return['screen_name_length'] = df['screen_name'].str.len()
    df_return['num_digits_in_screen_name'] = df['screen_name'].str.count('\d')
    df_return['name_length'] = df['name'].str.len()
    df_return['num_digits_in_name'] = df['name'].str.count('\d')
    df_return['description_length'] = df['description'].str.len()
    df_return['screen_name_likelihood'] = get_screen_name_likelihood(df['screen_name'])
    
    def bool_to_int(text):
        if 'True' in text:
            return 1
        elif 'False' in text:
            return 0
        else:
            return text
    for i in df_return.select_dtypes('object'):
        df_return[i] = df_return[i].apply(bool_to_int)
    
    return df_return.fillna(0.0)

In [4]:
feature_time = time.time()
df_new = feature_engineering(df_train)
end_feature_time = time.time()

In [5]:
df_new

Unnamed: 0,statuses_count,followers_count,friends_count,favourites_count,listed_count,default_profile,profile_use_background_image,verified,label,tweet_freq,...,friends_growth_rate,favourites_growth_rate,listed_growth_rate,followers_friends_ratio,screen_name_length,num_digits_in_screen_name,name_length,num_digits_in_name,description_length,screen_name_likelihood
0,9798,15349596,692,142,45568,0,1,1,0,808.630443,...,57.110866,11.719282,3760.734029,22181.497110,5,0,5,0,51,0.159735
1,0,0,44,7,0,1,1,0,1,0.000000,...,123.921859,19.714841,0.000000,0.000000,16,0,17,0,1,0.103708
2,5518,762839,475,953,3201,0,0,1,0,456.365283,...,39.284797,78.817708,264.738179,1605.976842,9,0,14,0,161,0.141913
3,192876,327587,4801,2946,1744,0,1,1,0,15553.281378,...,387.146684,237.561785,140.633997,68.233076,10,2,17,2,161,0.145170
4,103,13324,647,729,44,1,1,0,1,13.934062,...,87.527553,98.620690,5.952415,20.593509,15,0,18,0,104,0.073807
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
8273,25790,10997,821,20115,221,1,1,1,0,3474.254335,...,110.599566,2709.756726,29.771625,13.394641,14,0,12,0,161,0.096893
8274,425,44,637,298,0,1,1,0,1,89.160821,...,133.636337,62.517470,0.000000,0.069074,14,2,13,0,158,0.106675
8275,1439,342,849,2464,0,1,1,0,1,129.783248,...,76.571214,222.227883,0.000000,0.402827,15,0,16,0,134,0.120930
8276,674,72,367,2634,2,1,1,0,1,92.151543,...,50.177472,360.129323,0.273447,0.196185,9,3,8,0,66,0.110918


In [6]:
X = df_new.drop('label', axis=1).values
y = df_new['label'].values

In [7]:
start_time = time.time()
transformer = FunctionTransformer(np.log1p, validate=True)
X = transformer.transform(X)

decomposer = PCA(2)
X = decomposer.fit_transform(X)

model = KNeighborsClassifier(9)
model.fit(X, y)
end_time = time.time()

In [8]:
y_predict = model.predict(X)

In [9]:
print(classification_report(y_predict, y, digits=4))

              precision    recall  f1-score   support

           0     0.6286    0.7955    0.7022      2870
           1     0.8737    0.7506    0.8074      5408

    accuracy                         0.7661      8278
   macro avg     0.7511    0.7730    0.7548      8278
weighted avg     0.7887    0.7661    0.7710      8278



In [10]:
# ROC AUC score
roc_auc_score(y_predict, y)

0.773012558501536

In [11]:
# Training time and feature engineering time
end_time - start_time, end_feature_time - feature_time

(0.07233452796936035, 4.621851444244385)

In [12]:
df_test = pd.read_csv('../data/set-3/test/profile_info.csv').iloc[:, 1:]
df_label_test = pd.read_csv('../data/set-3/test/label.csv').iloc[:, 1:]
df_test = df_test.merge(df_label_test, on='ID')
df_test.head()

Unnamed: 0,ID,name,screen_name,location,description,url,protected,followers_count,friends_count,listed_count,...,profile_image_url_https,profile_link_color,profile_sidebar_border_color,profile_sidebar_fill_color,profile_text_color,profile_use_background_image,has_extended_profile,default_profile,default_profile_image,label
0,1188812492010487808,Sharon Israel ⭐️⭐️⭐️,SharonIsrael10,Los Angeles & Colorado,Day 1 Trump supporter. I rode the escalator! C...,,False,16596,16944,1,...,https://pbs.twimg.com/profile_images/118883642...,1DA1F2,C0DEED,DDEEF6,333333,True,True,True,False,1
1,155659213,Cristiano Ronaldo,Cristiano,"Turim, Piemonte",This Privacy Policy addresses the collection a...,http://www.facebook.com/cristiano,False,87313765,50,83703,...,https://pbs.twimg.com/profile_images/115731332...,1643C9,FFFFFF,838387,0D0D0D,True,False,False,False,0
2,147725246,FoxNewsInsider,FoxNewsInsider,NYC,Stay connected with everything Fox - the lates...,http://insider.foxnews.com,False,161827,361,1471,...,https://pbs.twimg.com/profile_images/881932020...,0084B4,FFFFFF,DDEEF6,333333,True,False,False,False,0
3,1296248637194895360,El Realista,ElReali03271594,Puerto Rico,Aprendizaje. Pensamiento Crítico. Debate de id...,,False,9,543,0,...,https://pbs.twimg.com/profile_images/129624930...,1DA1F2,C0DEED,DDEEF6,333333,True,True,True,False,1
4,1339835893,Hillary Clinton,HillaryClinton,"New York, NY","2016 Democratic Nominee, SecState, Senator, ha...",http://onwardtogether.org,False,28513011,846,40146,...,https://pbs.twimg.com/profile_images/129119233...,0057B8,000000,000000,000000,False,True,False,False,0


In [13]:
df_test = feature_engineering(df_test)

In [14]:
X = df_test.drop('label', axis=1).values
y = df_test['label'].values

In [15]:
transformer = FunctionTransformer(np.log1p, validate=True)
X = transformer.transform(X)

decomposer = PCA(2)
X = decomposer.fit_transform(X)

model = KNeighborsClassifier(9)
model.fit(X, y)

KNeighborsClassifier(n_neighbors=9)

In [16]:
y_predict = model.predict(X)

In [17]:
print(classification_report(y_predict, y, digits=4))

              precision    recall  f1-score   support

           0     0.6648    0.8280    0.7375       436
           1     0.8828    0.7564    0.8147       747

    accuracy                         0.7828      1183
   macro avg     0.7738    0.7922    0.7761      1183
weighted avg     0.8025    0.7828    0.7862      1183

