In [2]:
%load_ext autoreload
%autoreload 2
%matplotlib inline

import re

import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
import matplotlib
import numpy as np
matplotlib.rcParams['figure.figsize'] = [10, 6]

## Crazy Data Loading (bad csv file)

In [3]:
with open("../data/trump_twitter_20150616_20190819.csv", 'r') as f:
    lines = f.readlines()[::-1]
    
combined_counter = 0
parsed_lines = []
for line in lines[:-1]:
    output_line = line.strip().split("|")
    
    # Fix bad splits
    if len(output_line) > 2:
        output_line = ["|".join(output_line[:-1]), output_line[-1]]
    
    # Combine multipart messages
    if re.findall(r"^\.{2,}", output_line[0]) and re.findall(r"\.{2,}$", parsed_lines[-1][0]):
        first_cleaned = re.sub("^\.{2,}", " ", parsed_lines[-1][0])
        second_cleaned = re.sub("^\.{2,}", "", output_line[0])
        parsed_lines[-1][0] = first_cleaned + second_cleaned
        combined_counter += 1
        continue
        
    parsed_lines.append(output_line)
    
url_regex = r"(https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|www\.[a-zA-Z0-9][a-zA-Z0-9-]+[a-zA-Z0-9]\.[^\s]{2,}|https?:\/\/(?:www\.|(?!www))[a-zA-Z0-9]+\.[^\s]{2,}|www\.[a-zA-Z0-9]+\.[^\s]{2,})"
    
twitter_df = (
    pd.DataFrame({
        "created_at": [x[1] for x in parsed_lines],
        "text": [x[0] for x in parsed_lines]
    })
    .assign(created_at=lambda x: pd.to_datetime(x.created_at, infer_datetime_format=True))
    .assign(text=lambda x: x.text.str.replace("&amp;", "&").str.replace(url_regex, "", regex=True))
)

startswith_dots = twitter_df.text.apply(lambda x: len(re.findall("^[\.]{2,}", x))) > 0
endswith_dots = twitter_df.text.apply(lambda x: len(re.findall("[\.]{2,}$", x))) > 0

twitter_df = twitter_df[~startswith_dots & ~endswith_dots]

print(f"Combined {combined_counter} multipart messages")
print(f"Threw away {sum(startswith_dots) + sum(endswith_dots)} multipart messages without match")

twitter_df.head()

Combined 686 multipart messages
Threw away 184 multipart messages without match


Unnamed: 0,created_at,text
0,2015-06-16 12:20:40,Big time in U.S. today - MAKE AMERICA GREAT AG...
1,2015-06-16 12:24:41,Thanks.
2,2015-06-16 13:04:05,It is almost time. I will be making a major an...
3,2015-06-16 13:07:50,Make sure to follow me on @periscopeco #MakeAm...
4,2015-06-16 14:01:13,In one hour I will be making a major announcem...


## Apply Pretrained Classifier

In [6]:
import joblib
from sklearn.base import BaseEstimator, TransformerMixin

def identity(value):
    return value

class NERFeatures(BaseEstimator, TransformerMixin):
    def __init__(self, spacy_model):
        self.spacy_model = spacy_model
        self.nlp = spacy.load(spacy_model, disable=["parser", "ner"])
    
    def fit(self, X, y=None):
        return self 
    
    def transform(self, X, y=None):
        return [[token.tag_ for token in doc] for doc in self.nlp.pipe(X)]
    
class ColumnSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        return self

    def transform(self, data_dict):
        return data_dict[self.key]

clf = joblib.load("../models/trump_classifer_095AUC.pkl")

In [7]:
real_trump_ind = clf.predict(twitter_df)
real_twitter_df = twitter_df[real_trump_ind]

In [8]:
print(f"Realtrump rows: {len(real_twitter_df)}")
print(f"Percentage of original real: {len(real_twitter_df)/len(twitter_df):.2f}")

Realtrump rowsl: 6332
Percentage of original real: 0.51


In [14]:
for msg in real_twitter_df.text[:5]:
    print(msg,"\n")

Big time in U.S. today - MAKE AMERICA GREAT AGAIN! Politicians are all talk and no action - they can never bring us back. 

I am officially running for President of the United States. #MakeAmericaGreatAgain  

@ericbolling, in addition,no doubt you would have been amazing on @ApprenticeNBC! Keep up the great work. 

Trump Int'l Hotel & Tower, Chicago, has received accolades for design, service & our signature restaurant, "Sixteen"  

Just watched Brian Williams on @TODAYshow - very sad! Brian should get on with a new life and not start all over at @msnbc. Stop apologizing 



In [15]:
with open("../output/trumpbot_training_data.txt", 'w') as f:
    f.writelines("\n".join(real_twitter_df.text.tolist()))