In [1]:
import pandas as pd
import numpy as np
import time

from sklearn.metrics import classification_report, roc_auc_score
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.feature_extraction.text import TfidfVectorizer

In [2]:
df_train = pd.read_csv('../data/set-3/train/profile_info.csv').iloc[:, 1:]
df_label = pd.read_csv('../data/set-3/train/label.csv').iloc[:, 1:]
df_train = df_train.merge(df_label, on='ID')
df_train.head()

Unnamed: 0,ID,name,screen_name,location,description,url,protected,followers_count,friends_count,listed_count,...,profile_image_url_https,profile_link_color,profile_sidebar_border_color,profile_sidebar_fill_color,profile_text_color,profile_use_background_image,has_extended_profile,default_profile,default_profile_image,label
0,17461978,SHAQ,SHAQ,"Orlando, FL","VERY QUOTATIOUS, I PERFORM RANDOM ACTS OF SHAQ...",http://www.ShaqFuRadio.com,False,15349596,692,45568,...,https://pbs.twimg.com/profile_images/167390727...,2FC2EF,181A1E,252429,666666,True,False,False,False,0
1,1297437077403885568,Jennifer Fishpaw,JenniferFishpaw,,,,False,0,44,0,...,https://pbs.twimg.com/profile_images/129743740...,1DA1F2,C0DEED,DDEEF6,333333,True,True,True,False,1
2,17685258,Brad Parscale,parscale,Florida,Owner @ Parscale Strategy. Senior Advisor Digi...,http://www.parscale.com,False,762839,475,3201,...,https://pbs.twimg.com/profile_images/129545322...,AB2316,FFFFFF,FFFFFF,666666,False,False,False,False,0
3,15750898,FOX 13 Tampa Bay,FOX13News,"Tampa, FL",Bringing you the important stuff like breaking...,http://www.FOX13news.com,False,327587,4801,1744,...,https://pbs.twimg.com/profile_images/129319301...,0B2F8A,FFFFFF,E8EEF0,333333,True,False,False,False,0
4,1659167666,Vonte The Plug 🎤🔌,VonteThePlugNC,"Jacksonville Beach, FL",MOTIVATION 3 OUT NOW 🔥 Singles: ‘Lil Shawdy’ &...,https://music.apple.com/us/artist/vonte-the-pl...,False,13324,647,44,...,https://pbs.twimg.com/profile_images/118166240...,1DA1F2,C0DEED,DDEEF6,333333,True,False,True,False,1


In [5]:
def tfidf(series, ngram_range):
    text = series.values.reshape(-1)
    vectorizer = TfidfVectorizer(analyzer='char', ngram_range=ngram_range, max_features=500)
    m = vectorizer.fit_transform(text).todense()
    vocab = vectorizer.get_feature_names()
    return m, vocab

def string_entropy(string):
    char_dict = {}
    for i in string:
        if i not in char_dict:
            char_dict[i] = 0
        char_dict[i] += 1
    char_count = np.array(list(char_dict.values()))
    char_count = char_count / char_count.sum()
    entropy = (-char_count * np.log2(char_count)).sum()
    return entropy

def feature_engineering(df):
    m, vocab = tfidf(df['screen_name'].str.lower(), ngram_range=(3,3))
    df_return = pd.DataFrame(m, columns=vocab)
    df_return['entropy'] = df['screen_name'].apply(string_entropy)
    df_return['uppercase'] = df['screen_name'].str.count(r'[A-Z]') / df['screen_name'].str.len()
    df_return['lowercase'] = df['screen_name'].str.count(r'[a-z]') / df['screen_name'].str.len()
    return df_return

In [6]:
feature_time = time.time()
df_new = feature_engineering(df_train)
end_feature_time = time.time()

In [7]:
X = df_new.values
y = df_train['label'].values

In [8]:
start_time = time.time()
model = LogisticRegression()
model.fit(X, y)
end_time = time.time()

STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


In [9]:
y_predict = model.predict(X)

In [10]:
print(classification_report(y_predict, y, digits=4))

              precision    recall  f1-score   support

           0     0.4218    0.6170    0.5011      2483
           1     0.7953    0.6376    0.7078      5795

    accuracy                         0.6314      8278
   macro avg     0.6086    0.6273    0.6044      8278
weighted avg     0.6833    0.6314    0.6458      8278



In [11]:
# ROC AUC score
roc_auc_score(y_predict, y)

0.6273071033154876

In [12]:
# Training time and feature engineering time
end_time - start_time, end_feature_time - feature_time

(0.6473333835601807, 0.3842012882232666)

In [13]:
df_test = pd.read_csv('../data/set-3/test/profile_info.csv').iloc[:, 1:]
df_label_test = pd.read_csv('../data/set-3/test/label.csv').iloc[:, 1:]
df_test = df_test.merge(df_label_test, on='ID')
df_test.head()

Unnamed: 0,ID,name,screen_name,location,description,url,protected,followers_count,friends_count,listed_count,...,profile_image_url_https,profile_link_color,profile_sidebar_border_color,profile_sidebar_fill_color,profile_text_color,profile_use_background_image,has_extended_profile,default_profile,default_profile_image,label
0,1188812492010487808,Sharon Israel ⭐️⭐️⭐️,SharonIsrael10,Los Angeles & Colorado,Day 1 Trump supporter. I rode the escalator! C...,,False,16596,16944,1,...,https://pbs.twimg.com/profile_images/118883642...,1DA1F2,C0DEED,DDEEF6,333333,True,True,True,False,1
1,155659213,Cristiano Ronaldo,Cristiano,"Turim, Piemonte",This Privacy Policy addresses the collection a...,http://www.facebook.com/cristiano,False,87313765,50,83703,...,https://pbs.twimg.com/profile_images/115731332...,1643C9,FFFFFF,838387,0D0D0D,True,False,False,False,0
2,147725246,FoxNewsInsider,FoxNewsInsider,NYC,Stay connected with everything Fox - the lates...,http://insider.foxnews.com,False,161827,361,1471,...,https://pbs.twimg.com/profile_images/881932020...,0084B4,FFFFFF,DDEEF6,333333,True,False,False,False,0
3,1296248637194895360,El Realista,ElReali03271594,Puerto Rico,Aprendizaje. Pensamiento Crítico. Debate de id...,,False,9,543,0,...,https://pbs.twimg.com/profile_images/129624930...,1DA1F2,C0DEED,DDEEF6,333333,True,True,True,False,1
4,1339835893,Hillary Clinton,HillaryClinton,"New York, NY","2016 Democratic Nominee, SecState, Senator, ha...",http://onwardtogether.org,False,28513011,846,40146,...,https://pbs.twimg.com/profile_images/129119233...,0057B8,000000,000000,000000,False,True,False,False,0


In [14]:
df_test_new = feature_engineering(df_test)

In [15]:
X = df_test_new.values
y = df_test['label'].values

In [16]:
y_predict = model.predict(X)

In [17]:
print(classification_report(y_predict, y, digits=4))

              precision    recall  f1-score   support

           0     0.3112    0.5434    0.3958       311
           1     0.7781    0.5711    0.6587       872

    accuracy                         0.5638      1183
   macro avg     0.5447    0.5573    0.5273      1183
weighted avg     0.6554    0.5638    0.5896      1183

