# _ML Attempt: Dec. 27, 2019_

In [1]:
# (re)load the autoreload extension; mode 2 re-imports modules before each cell runs
%reload_ext autoreload
%autoreload 2
# render matplotlib figures inline in the cell output
%matplotlib inline

In [2]:
import pandas as pd

# never truncate the column list when a DataFrame is displayed
pd.set_option("display.max_columns", None)

# read the id column as text instead of letting pandas infer a type
# (presumably to keep long numeric ids intact -- confirm)
dtype = {"id_str": str}

# verified training set
verified = pd.read_json("json-data/verified_train.json", orient="split", dtype=dtype)

# ira training set
ira = pd.read_json("json-data/ira_train.json", orient="split", dtype=dtype)

In [3]:
# summarize the verified data: row count, per-column dtypes and non-null counts
verified.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 168883 entries, 0 to 168882
Data columns (total 10 columns):
id_str            168883 non-null object
screen_name       168883 non-null object
created_at        168883 non-null datetime64[ns]
lang              168883 non-null object
source            168883 non-null object
retweet_count     168883 non-null int64
favorite_count    168883 non-null int64
full_text         168883 non-null object
clean_text        168883 non-null object
label             168883 non-null object
dtypes: datetime64[ns](1), int64(2), object(7)
memory usage: 14.2+ MB


In [4]:
# summarize the ira data: row count, per-column dtypes and non-null counts
ira.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 1525311 entries, 0 to 1525310
Data columns (total 10 columns):
id_str            1525311 non-null object
screen_name       1525311 non-null object
created_at        1525311 non-null datetime64[ns]
lang              1525311 non-null object
source            1525311 non-null object
retweet_count     1525311 non-null int64
favorite_count    1525311 non-null int64
full_text         1525311 non-null object
clean_text        1525311 non-null object
label             1525311 non-null object
dtypes: datetime64[ns](1), int64(2), object(7)
memory usage: 128.0+ MB


In [5]:
# downsample ira to exactly as many rows as the verified data (fixed seed)
ira_sample = ira.sample(n=verified.shape[0], random_state=1)

In [6]:
# summarize ira_sample: row count, per-column dtypes and non-null counts
ira_sample.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 168883 entries, 1169189 to 923184
Data columns (total 10 columns):
id_str            168883 non-null object
screen_name       168883 non-null object
created_at        168883 non-null datetime64[ns]
lang              168883 non-null object
source            168883 non-null object
retweet_count     168883 non-null int64
favorite_count    168883 non-null int64
full_text         168883 non-null object
clean_text        168883 non-null object
label             168883 non-null object
dtypes: datetime64[ns](1), int64(2), object(7)
memory usage: 14.2+ MB


In [12]:
# concat together ira_sample and verified data (rows stacked, verified first);
# ignore_index=True discards both original indices and renumbers from 0
combine = pd.concat([verified, ira_sample], ignore_index=True)

In [13]:
# sanity check: the combined frame has exactly as many rows as its two inputs
len(combine) == len(verified) + len(ira_sample)

True

In [14]:
# summarize the combined dataframe: row count, per-column dtypes and non-null counts
combine.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 337766 entries, 0 to 337765
Data columns (total 10 columns):
id_str            337766 non-null object
screen_name       337766 non-null object
created_at        337766 non-null datetime64[ns]
lang              337766 non-null object
source            337766 non-null object
retweet_count     337766 non-null int64
favorite_count    337766 non-null int64
full_text         337766 non-null object
clean_text        337766 non-null object
label             337766 non-null object
dtypes: datetime64[ns](1), int64(2), object(7)
memory usage: 25.8+ MB


In [16]:
# shuffle combined data set: frac=1 draws every row once, in random order (fixed seed)
# NOTE(review): `shuffled` is not referenced by any later cell visible here -- the
# 10% sample below is drawn from `combine` instead; confirm whether it is still needed.
shuffled = combine.sample(frac=1, random_state=1).copy()

## _Get 10% Sample of `combine` DataFrame & conduct initial ML experiment_

In [10]:
# get 10% sample of combined data set (random rows, fixed seed)
# NOTE(review): this cell's recorded execution count (In [10]) is lower than that of
# the cell that creates `combine` (In [12]), so the saved run was out of order --
# verify the notebook with Restart Kernel -> Run All.
train_sample = combine.sample(frac=0.10, random_state=1)

In [17]:
# reset index: renumber the sampled rows from 0; drop=True discards the old index
# instead of keeping it as a column
train_sample = train_sample.reset_index(drop=True)

In [18]:
# summarize the sample so far: row count, per-column dtypes and non-null counts
train_sample.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 33777 entries, 0 to 33776
Data columns (total 10 columns):
id_str            33777 non-null object
screen_name       33777 non-null object
created_at        33777 non-null datetime64[ns]
lang              33777 non-null object
source            33777 non-null object
retweet_count     33777 non-null int64
favorite_count    33777 non-null int64
full_text         33777 non-null object
clean_text        33777 non-null object
label             33777 non-null object
dtypes: datetime64[ns](1), int64(2), object(7)
memory usage: 2.6+ MB


In [19]:
# replace real accounts with 0 and fake accounts with 1
# The explicit cast pins the column to int64 rather than relying on `replace`
# to downcast its object-dtype result, an implicit behavior that pandas 2.2
# deprecates. The cast raises if any label other than "real"/"fake" remains,
# instead of silently leaving a mixed str/int column.
train_sample["label"] = (
    train_sample["label"]
    .replace(to_replace={"real": 0, "fake": 1})
    .astype("int64")
)

In [20]:
# hold out 30% of the sample as a test set; the fixed seed keeps the split repeatable
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    train_sample.loc[:, "clean_text"],  # inputs: the clean_text column
    train_sample.loc[:, "label"],       # targets: the label column
    test_size=0.3,
    random_state=1,
)

In [21]:
# build a simple pipeline
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
from sklearn.naive_bayes import MultinomialNB

In [22]:
# token counts -> tf-idf weighting -> multinomial naive Bayes, all with default settings
text_clf = Pipeline(
    steps=[
        ("vect", CountVectorizer()),
        ("tfidf", TfidfTransformer()),
        ("clf", MultinomialNB()),
    ]
)

In [23]:
# fit classifier: every pipeline step is fitted on the training split only
text_clf.fit(X_train, y_train)

Pipeline(memory=None,
         steps=[('vect',
                 CountVectorizer(analyzer='word', binary=False,
                                 decode_error='strict',
                                 dtype=<class 'numpy.int64'>, encoding='utf-8',
                                 input='content', lowercase=True, max_df=1.0,
                                 max_features=None, min_df=1,
                                 ngram_range=(1, 1), preprocessor=None,
                                 stop_words=None, strip_accents=None,
                                 token_pattern='(?u)\\b\\w\\w+\\b',
                                 tokenizer=None, vocabulary=None)),
                ('tfidf',
                 TfidfTransformer(norm='l2', smooth_idf=True,
                                  sublinear_tf=False, use_idf=True)),
                ('clf',
                 MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True))],
         verbose=False)

In [24]:
# create predictions for the held-out test split
predicted = text_clf.predict(X_test)

In [25]:
import numpy as np

# accuracy: fraction of test rows whose predicted label equals the true label
np.mean(predicted == y_test)

0.8056048944148412

In [26]:
# peek at the predicted labels (0 = real, 1 = fake, per the label encoding above)
predicted

array([0, 1, 0, ..., 0, 1, 1])

In [27]:
from sklearn.metrics import f1_score

# F1 on the test split; with default arguments this scores label 1 (fake) only
f1_score(y_test, predicted)

0.7917987740435426

In [28]:
import matplotlib.pyplot as plt
from sklearn.metrics import classification_report

In [29]:
# per-class precision, recall, F1 and support on the test split
# (print is needed here: the report is a multi-line string)
print(classification_report(y_test, predicted))

              precision    recall  f1-score   support

           0       0.77      0.88      0.82      5041
           1       0.86      0.74      0.79      5093

    accuracy                           0.81     10134
   macro avg       0.81      0.81      0.80     10134
weighted avg       0.81      0.81      0.80     10134

