# _ML Experimentation: Dec. 30, 2019_

**Purpose**: Continue exploring how best to apply machine learning to the data set. I'll continue working with a sample of all the data, which will also include different arrangements of the fake/real account split percentage.

In [1]:
%reload_ext autoreload
%autoreload 2
%matplotlib inline

In [2]:
import pandas as pd
pd.set_option("display.max_columns", None)

# read the id_str column as str in both files
dtype = dict(id_str=str)

# verified-account training set
verified = pd.read_json("json-data/verified_train.json", orient="split", dtype=dtype)

# IRA training set, read with the same options
ira = pd.read_json("json-data/ira_train.json", orient="split", dtype=dtype)

In [3]:
# info from Verified user data
verified.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 168883 entries, 0 to 168882
Data columns (total 10 columns):
id_str            168883 non-null object
screen_name       168883 non-null object
created_at        168883 non-null datetime64[ns]
lang              168883 non-null object
source            168883 non-null object
retweet_count     168883 non-null int64
favorite_count    168883 non-null int64
full_text         168883 non-null object
clean_text        168883 non-null object
label             168883 non-null object
dtypes: datetime64[ns](1), int64(2), object(7)
memory usage: 14.2+ MB


In [4]:
# info from IRA data
ira.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1525311 entries, 0 to 1525310
Data columns (total 10 columns):
id_str            1525311 non-null object
screen_name       1525311 non-null object
created_at        1525311 non-null datetime64[ns]
lang              1525311 non-null object
source            1525311 non-null object
retweet_count     1525311 non-null int64
favorite_count    1525311 non-null int64
full_text         1525311 non-null object
clean_text        1525311 non-null object
label             1525311 non-null object
dtypes: datetime64[ns](1), int64(2), object(7)
memory usage: 128.0+ MB


In [5]:
# draw as many IRA rows as there are Verified rows (seeded so the draw is repeatable)
ira_sample = ira.sample(n=verified.shape[0], random_state=1)

In [6]:
# get info on sample
ira_sample.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 168883 entries, 1169189 to 923184
Data columns (total 10 columns):
id_str            168883 non-null object
screen_name       168883 non-null object
created_at        168883 non-null datetime64[ns]
lang              168883 non-null object
source            168883 non-null object
retweet_count     168883 non-null int64
favorite_count    168883 non-null int64
full_text         168883 non-null object
clean_text        168883 non-null object
label             168883 non-null object
dtypes: datetime64[ns](1), int64(2), object(7)
memory usage: 14.2+ MB


In [7]:
# stack the Verified rows and the IRA sample into a single frame with a fresh index
combine = pd.concat([verified, ira_sample], ignore_index=True)

# sanity check: the combined row count must equal Verified + IRA sample
expected_rows = len(verified) + len(ira_sample)
print(len(combine) == expected_rows)

True


In [8]:
# store the screen_name and source columns as categorical type
combine = combine.astype({"screen_name": "category", "source": "category"})

In [9]:
# get info on Combined Dataframe
combine.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 337766 entries, 0 to 337765
Data columns (total 10 columns):
id_str            337766 non-null object
screen_name       337766 non-null category
created_at        337766 non-null datetime64[ns]
lang              337766 non-null object
source            337766 non-null category
retweet_count     337766 non-null int64
favorite_count    337766 non-null int64
full_text         337766 non-null object
clean_text        337766 non-null object
label             337766 non-null object
dtypes: category(2), datetime64[ns](1), int64(2), object(5)
memory usage: 22.0+ MB


In [10]:
# shuffle every row of the combined data set (helps when sampling from it for ML development)
# and renumber the rows afterwards, discarding the pre-shuffle index
shuffled = (
    combine
    .sample(frac=1, random_state=1)
    .reset_index(drop=True)
)

# preview the first three rows
shuffled.head(3)

Unnamed: 0,id_str,screen_name,created_at,lang,source,retweet_count,favorite_count,full_text,clean_text,label
0,561424233624915968,DailyLosAngeles,2015-01-31 07:22:00,en,twitterfeed,0,0,San Francisco police detain 2 in dismembered b...,San Francisco police detain 2 in dismembered b...,fake
1,1159709950018539520,MeekMill,2019-08-09 06:16:13,en,Twitter for iPhone,4,25,@PrimeVideo then call it a night!! https://t.c...,PrimeVideo then call it a night,real
2,68709491461464064,ActuallyNPH,2011-05-12 16:09:53,en,Twitter Web Client,99,18,"NYers: I'm trying to find a simple, furnished ...",NYers Im trying to find a simple furnished sub...,real


In [11]:
# summary of the shuffled frame (row count, column dtypes, memory usage)
shuffled.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 337766 entries, 0 to 337765
Data columns (total 10 columns):
id_str            337766 non-null object
screen_name       337766 non-null category
created_at        337766 non-null datetime64[ns]
lang              337766 non-null object
source            337766 non-null category
retweet_count     337766 non-null int64
favorite_count    337766 non-null int64
full_text         337766 non-null object
clean_text        337766 non-null object
label             337766 non-null object
dtypes: category(2), datetime64[ns](1), int64(2), object(5)
memory usage: 22.0+ MB


In [21]:
# convert shuffled to JSON so that we can simply load in JSON file for future ML development
# NOTE(review): `shuffled` is the full shuffled data set (same row count as `combine`), not a 20% sample,
# even though the file is named "ml_sample20.json" -- confirm which frame was meant to be saved.
# NOTE(review): this cell's execution count (21) is out of sequence with the cells around it.
shuffled.to_json("json-data/ml_sample20.json", orient="split")

## _Take 20% Sample & Experimental ML_

In [12]:
# get 20% of combined data set
# (drawn from `combine`, not `shuffled`; random_state=1 makes the draw repeatable)
train_sample = combine.sample(frac=0.2, random_state=1)

In [13]:
# reset the index so the sampled rows are renumbered from 0;
# drop=True discards the old index instead of keeping it as a column
train_sample = train_sample.reset_index(drop=True)

In [14]:
# summary of the 20% sample (row count, column dtypes, memory usage)
train_sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 67553 entries, 0 to 67552
Data columns (total 10 columns):
id_str            67553 non-null object
screen_name       67553 non-null category
created_at        67553 non-null datetime64[ns]
lang              67553 non-null object
source            67553 non-null category
retweet_count     67553 non-null int64
favorite_count    67553 non-null int64
full_text         67553 non-null object
clean_text        67553 non-null object
label             67553 non-null object
dtypes: category(2), datetime64[ns](1), int64(2), object(5)
memory usage: 4.5+ MB


In [15]:
# encode the label column as integers: 0 for real accounts and 1 for fake accounts
label_codes = {"real": 0, "fake": 1}

# Values that are already encoded pass through unchanged, so re-running this cell is harmless.
# The explicit astype makes the column int64 instead of relying on replace() to downcast an
# object column, and it raises if a value is neither a known label nor an integer.
train_sample["label"] = (
    train_sample["label"]
    .map(lambda value: label_codes.get(value, value))
    .astype("int64")
)

In [16]:
from sklearn.model_selection import train_test_split

# hold out 30% of the sampled rows as a test set; the fixed seed keeps the split repeatable
features, target = train_sample["clean_text"], train_sample["label"]
X_train, X_test, y_train, y_test = train_test_split(features, target, test_size=0.3, random_state=1)

In [17]:
# libraries to help us build a function that takes in varying ML algorithms and tests their performance
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC
from sklearn.experimental import enable_hist_gradient_boosting
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, HistGradientBoostingClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score

In [18]:
# create function that takes in a classifier, fits it to the data, and produces a metric score for performance
# assessment
def ml_test(x_train, y_train, x_test, y_test, clf):
    """
    Train a text-classification pipeline around `clf` and score it on held-out data.

    The pipeline chains CountVectorizer -> TfidfTransformer -> `clf`. It is fitted on
    (x_train, y_train) and then used to predict labels for x_test, which are compared
    against y_test.

    Returns the F1 score computed by f1_score with its default settings.
    """
    steps = [
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        ("clf", clf),
    ]
    # Pipeline.fit returns the fitted pipeline itself, so it can be chained
    fitted = Pipeline(steps).fit(x_train, y_train)
    return f1_score(y_test, fitted.predict(x_test))

In [19]:
from tqdm import tqdm
from pprint import pprint
from time import sleep

# candidate classifiers; random_state=1 is passed to each one that is given a seed here
naive_bayes = MultinomialNB()
svc = LinearSVC(random_state=1)
lr = LogisticRegression(random_state=1, solver="lbfgs", max_iter=200)
rf = RandomForestClassifier(random_state=1, n_estimators=10, n_jobs=-1)
ada = AdaBoostClassifier(random_state=1)
hist = HistGradientBoostingClassifier(random_state=1)  # created here but not part of the loop below

# dry run: print each classifier's class name (the text before the first "(" of its repr)
# with a one-second pause per item so the tqdm progress bar is visible
for clf in tqdm([naive_bayes, svc, lr, rf, ada]):
    pprint(str(clf).partition("(")[0])
    sleep(1)

  0%|          | 0/5 [00:00<?, ?it/s]

'MultinomialNB'


 20%|██        | 1/5 [00:01<00:04,  1.00s/it]

'LinearSVC'


 40%|████      | 2/5 [00:02<00:03,  1.00s/it]

'LogisticRegression'


 60%|██████    | 3/5 [00:03<00:02,  1.00s/it]

'RandomForestClassifier'


 80%|████████  | 4/5 [00:04<00:01,  1.00s/it]

'AdaBoostClassifier'


100%|██████████| 5/5 [00:05<00:00,  1.00s/it]


In [63]:
# fit each classifier on the train split and report its F1 score on the test split (4 decimals)
for clf in tqdm([naive_bayes, svc, lr, rf, ada]):
    clf_name = str(clf).partition("(")[0]
    f1 = round(ml_test(X_train, y_train, X_test, y_test, clf), 4)
    print(f"F1 Score for {clf_name}: {f1}")



  0%|          | 0/5 [00:00<?, ?it/s][A[A

 20%|██        | 1/5 [00:01<00:04,  1.17s/it][A[A

F1 Score for MultinomialNB: 0.8276




 40%|████      | 2/5 [00:02<00:03,  1.30s/it][A[A

F1 Score for LinearSVC: 0.8508




 60%|██████    | 3/5 [00:05<00:03,  1.60s/it][A[A

F1 Score for LogisticRegression: 0.837




 80%|████████  | 4/5 [00:10<00:02,  2.74s/it][A[A

F1 Score for RandomForestClassifier: 0.8019




100%|██████████| 5/5 [00:16<00:00,  3.31s/it][A[A

F1 Score for AdaBoostClassifier: 0.7493





In [20]:
from sklearn.model_selection import cross_val_score

# function that's similar to ml_test but with cross validation
def cv_ml_test(X, y, clf):
    """
    Run 5-fold cross-validation of a text-classification pipeline built around `clf`.

    The pipeline chains CountVectorizer -> TfidfTransformer -> `clf` and every fold is
    scored with macro-averaged F1 (scoring="f1_macro").

    Parameters
    ----------
    X : the text documents passed to cross_val_score
    y : the labels passed to cross_val_score
    clf : the estimator used as the final pipeline step

    Returns
    -------
    str
        "Macro F1: <mean> (+/- <2 * std>)" over the five fold scores, two decimals each.
    """
    # keep the pipeline in its own name rather than rebinding the `clf` argument
    text_clf = Pipeline([
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        ("clf", clf)
    ])
    # one macro-F1 score per fold
    scores = cross_val_score(text_clf, X, y, cv=5, scoring="f1_macro")
    # summarise as mean +/- two standard deviations, labelled with the metric actually computed
    # (the scores are macro F1, not accuracy)
    return "Macro F1: %0.2f (+/- %0.2f)" % (scores.mean(), scores.std() * 2)

In [67]:
# run every candidate classifier through cv_ml_test and print its cross-validated summary
for clf in tqdm([naive_bayes, svc, rf, lr, ada]):
    clf_name = str(clf).partition("(")[0]
    scores = cv_ml_test(train_sample["clean_text"], train_sample["label"], clf)
    print(f"CV F1 Score for {clf_name}: {scores}")



  0%|          | 0/5 [00:00<?, ?it/s][A[A

 20%|██        | 1/5 [00:05<00:23,  5.78s/it][A[A

CV F1 Score for MultinomialNB: Accuracy: 0.84 (+/- 0.01)




 40%|████      | 2/5 [00:13<00:19,  6.37s/it][A[A

CV F1 Score for LinearSVC: Accuracy: 0.85 (+/- 0.01)




 60%|██████    | 3/5 [00:48<00:30, 15.01s/it][A[A

CV F1 Score for RandomForestClassifier: Accuracy: 0.82 (+/- 0.01)




 80%|████████  | 4/5 [00:59<00:13, 13.76s/it][A[A

CV F1 Score for LogisticRegression: Accuracy: 0.84 (+/- 0.01)




100%|██████████| 5/5 [01:34<00:00, 18.85s/it][A[A

CV F1 Score for AdaBoostClassifier: Accuracy: 0.74 (+/- 0.01)





In [49]:
# scratch check of the "%0.2f (+/- %0.2f)" format string with made-up numbers (not a model result)
str("Accuracy: %0.2f (+/- %0.2f)" % (98, 1.5 * 2))

'Accuracy: 98.00 (+/- 3.00)'

## _`Yellowbrick` Model Evaluation_

In [27]:
from sklearn.svm import LinearSVC, NuSVC, SVC
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import BaggingClassifier, RandomForestClassifier, AdaBoostClassifier, HistGradientBoostingClassifier

In [28]:
# candidate classifiers; random_state=1 is passed to each one that is given a seed here
naive_bayes = MultinomialNB()
svc = LinearSVC(random_state=1)
lr = LogisticRegression(random_state=1, solver="lbfgs", max_iter=200)
rf = RandomForestClassifier(random_state=1, n_estimators=10, n_jobs=-1)
ada = AdaBoostClassifier(random_state=1)
hist = HistGradientBoostingClassifier(random_state=1)  # created here but not added to `models`

# the estimators to evaluate, in reporting order
models = [naive_bayes, svc, lr, rf, ada]

def test_models(X, y, model):
    """
    Cross-validate `model` inside a text pipeline and summarise its F1 score.

    The pipeline chains CountVectorizer -> TfidfTransformer -> `model`. It is evaluated
    with cross_val_score using cv=5, scoring="f1" and n_jobs=-1.

    Returns a string "<mean> (+/- <2 * std>)" over the fold scores, two decimals each.
    """
    steps = [
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        ("clf", model),
    ]
    fold_scores = cross_val_score(Pipeline(steps), X, y, cv=5, scoring="f1", n_jobs=-1)
    # mean of the fold scores and twice their standard deviation
    mean, spread = fold_scores.mean(), fold_scores.std() * 2
    return "%0.2f (+/- %0.2f)" % (mean, spread)

In [30]:
# evaluate each model in `models` with test_models and print its cross-validated F1 summary
for model in tqdm(models):
    clf_name = str(model).partition("(")[0]
    scores = test_models(train_sample["clean_text"], train_sample["label"], model)
    print(f"CV F1 Score for {clf_name}: {scores}")





  0%|          | 0/5 [00:00<?, ?it/s][A[A[A[A



 20%|██        | 1/5 [00:02<00:09,  2.46s/it][A[A[A[A

CV F1 Score for MultinomialNB: 0.83 (+/- 0.01)






 40%|████      | 2/5 [00:05<00:08,  2.71s/it][A[A[A[A

CV F1 Score for LinearSVC: 0.86 (+/- 0.00)






 60%|██████    | 3/5 [00:10<00:06,  3.34s/it][A[A[A[A

CV F1 Score for LogisticRegression: 0.84 (+/- 0.01)






 80%|████████  | 4/5 [00:39<00:11, 11.11s/it][A[A[A[A

CV F1 Score for RandomForestClassifier: 0.81 (+/- 0.01)






100%|██████████| 5/5 [00:54<00:00, 10.95s/it][A[A[A[A

CV F1 Score for AdaBoostClassifier: 0.75 (+/- 0.01)



