## Convert Datset

Converts Russian Troll Bot dataset to train, test, and validation tsv files.  Albert expects this format.

In [2]:
# Import Russian Troll Tweets dataset

import glob
import pandas as pd

folder = "data/russian-troll-tweets-master"
russian_tweets = pd.concat([pd.read_csv(file, dtype='str') for file in glob.glob(folder + '/*.csv')])

russian_tweets = russian_tweets[russian_tweets['language'] == 'English']  # English tweets only.
russian_tweets = russian_tweets[pd.notnull(russian_tweets['content'])]  # drop null tweets

russian_tweets['publish_date'] = pd.to_datetime(russian_tweets['publish_date'])

print(russian_tweets.shape)
print(russian_tweets[russian_tweets['account_category'] == 'RightTroll'].iloc[0]['content'])
russian_tweets.keys()

(2116866, 21)
Come support our peace officers. Our choice is law & order. #BlueLivesMatter  Rally link:  https://t.co/PuKr7nBBxP https://t.co/QpCg8jRFkN


Index(['external_author_id', 'author', 'content', 'region', 'language',
       'publish_date', 'harvested_date', 'following', 'followers', 'updates',
       'post_type', 'account_type', 'retweet', 'account_category',
       'new_june_2018', 'alt_external_id', 'tweet_id', 'article_url',
       'tco1_step1', 'tco2_step1', 'tco3_step1'],
      dtype='object')

In [3]:
# Create date range dataframes, for comparing model training performance by year.

rt2015 = russian_tweets[russian_tweets['publish_date'].dt.year == 2015]
rt2016 = russian_tweets[russian_tweets['publish_date'].dt.year == 2016]
rt2017 = russian_tweets[russian_tweets['publish_date'].dt.year == 2017]

In [4]:
# Import Twitter Stream dataset

import pandas as pd

stream_tweets = pd.read_json("data/en_tweets.json", dtype='str')

# Replace newlines and tabs with spaces, because output is a tsv.
stream_tweets['text'].replace('\t', ' ', regex=True, inplace=True)
stream_tweets['text'].replace('\n', ' ', regex=True, inplace=True)

# Convert str type to date type.
stream_tweets['created_at'] = pd.to_datetime(stream_tweets['created_at'])

print(stream_tweets.shape)
print(stream_tweets[stream_tweets['lang'] == 'en'].count()[0])
print(stream_tweets['text'][0])
stream_tweets.keys()

(100000, 37)
100000
RT @beastieboys: “Wait, what!?! I just heard that Mike &amp; Adam made a movie about Beastie Boys with Spike Jonze. Is that for real?" - Larry…


Index(['created_at', 'id', 'id_str', 'text', 'source', 'truncated',
       'in_reply_to_status_id', 'in_reply_to_status_id_str',
       'in_reply_to_user_id', 'in_reply_to_user_id_str',
       'in_reply_to_screen_name', 'user', 'geo', 'coordinates', 'place',
       'contributors', 'retweeted_status', 'is_quote_status', 'quote_count',
       'reply_count', 'retweet_count', 'favorite_count', 'entities',
       'favorited', 'retweeted', 'filter_level', 'lang', 'timestamp_ms',
       'display_text_range', 'quoted_status_id', 'quoted_status_id_str',
       'quoted_status', 'quoted_status_permalink', 'possibly_sensitive',
       'extended_tweet', 'extended_entities', 'withheld_in_countries'],
      dtype='object')

In [8]:
# Create Train and Test Sets

import numpy as np

# Limit number of samples to keep training fast
TOTAL_NUM_SAMPLES = 100_000
CATEGORIES = 4
numb_samples = int(TOTAL_NUM_SAMPLES/CATEGORIES)

# Column names Albert expects
X_COLUMN = "text"
Y_COLUMN = "label"

# Create non-troll train,test,validate dataset
def create_nontroll_train_test_validate_sets(df):
    st = pd.DataFrame({X_COLUMN: df['text'], Y_COLUMN: pd.DataFrame(['NotTroll']*len(df['text']))[0]})
    st = st.sample(numb_samples)  # Keep training size small and evenly distributed.
    st.loc[st[Y_COLUMN] == 'NotTroll',Y_COLUMN] = 3  # Albert expects category names to be numbers.
    
    # Split data into train and test sets
    # 60% train, 20% validation, 20% test
    train, validate, test = np.split(st.sample(frac=1), [int(0.6*len(st)), int(0.8*len(st))])
    return train, validate, test

def create_russian_troll_train_test_validate_sets(df):
    """Used"""
    rt = pd.DataFrame({X_COLUMN: df['content'], Y_COLUMN: df['account_category']})
    
    # Select only trolls and news feeds
    left_troll = rt[rt[Y_COLUMN] == 'LeftTroll'].sample(numb_samples)
    right_troll = rt[rt[Y_COLUMN] == 'RightTroll'].sample(numb_samples)
    news_feed = rt[rt[Y_COLUMN] == 'NewsFeed'].sample(numb_samples)
    rt = pd.concat([left_troll, right_troll, news_feed])
    
    # Convert categories to ints. Albert expects ints.
    rt.loc[rt[Y_COLUMN] == 'LeftTroll',Y_COLUMN] = 0
    rt.loc[rt[Y_COLUMN] == 'RightTroll',Y_COLUMN] = 1
    rt.loc[rt[Y_COLUMN] == 'NewsFeed',Y_COLUMN] = 2
    
    # Split data into train and test sets
    # 60% train, 20% validation, 20% test
    train, validate, test = np.split(rt.sample(frac=1), [int(0.6*len(rt)), int(0.8*len(rt))])
    return train, validate, test


st_train, st_validate, st_test = create_nontroll_train_test_validate_sets(stream_tweets)
print(len(st_train))

rt_train, rt_validate, rt_test = create_russian_troll_train_test_validate_sets(russian_tweets)
print(len(rt_train))

rt2015_train, rt2015_validate, rt2015_test = create_russian_troll_train_test_validate_sets(rt2015)
rt2016_train, rt2016_validate, rt2016_test = create_russian_troll_train_test_validate_sets(rt2016)
rt2017_train, rt2017_validate, rt2017_test = create_russian_troll_train_test_validate_sets(rt2017)
print(len(rt2015_train))
print(len(rt2016_train))
print(len(rt2017_train))

15000
45000
45000
45000
45000


In [9]:
# Write to tab seperate value files.

from pathlib import Path
import csv

def write_to_tsv(df, path, filename):
    path = Path(path)
    
    # Make folder if it doesn't already exist.
    path.mkdir(parents=True, exist_ok=True)

    df.to_csv(str(path/filename)+'.tsv', sep='\t', index=False)

def write_train_validate_test_to_tsv(train, validate, test, path):
    # Shuffle up tweets, as Albert may read this in order, causing faulty training.
    train = train.sample(frac=1)
    validate = validate.sample(frac=1)
    test = test.sample(frac=1)
    
    write_to_tsv(train, path, "train")
    write_to_tsv(validate, path, "dev")
    write_to_tsv(test, path, "test")

    
# All years tweets (2015-2017)
write_train_validate_test_to_tsv(pd.concat([rt_train, st_train], ignore_index=True),
                                 pd.concat([rt_validate, st_validate], ignore_index=True),
                                 pd.concat([rt_test, st_test], ignore_index=True),
                                 'data/tweets')

# Trained on 2015 tweets, validated on 2015 tweets
write_train_validate_test_to_tsv(pd.concat([rt2015_train, st_train], ignore_index=True),
                                 pd.concat([rt2015_validate, st_validate], ignore_index=True),
                                 pd.concat([rt2015_test, st_test], ignore_index=True),
                                 'data/2015/tweets')

# Trained on 2015 tweets, validated on 2016 tweets
write_train_validate_test_to_tsv(pd.concat([rt2015_train, st_train], ignore_index=True),
                                 pd.concat([rt2016_validate, st_validate], ignore_index=True),
                                 pd.concat([rt2016_test, st_test], ignore_index=True),
                                 'data/2016/tweets')

# Trained on 2015 tweets, validated on 2017 tweets
write_train_validate_test_to_tsv(pd.concat([rt2015_train, st_train], ignore_index=True),
                                 pd.concat([rt2017_validate, st_validate], ignore_index=True),
                                 pd.concat([rt2017_test, st_test], ignore_index=True),
                                 'data/2017/tweets')

In [10]:
# Verify

from pathlib import Path
import pandas as pd
import csv

def read_tsv(path, filename):
    return pd.read_csv(str(Path(path)/filename)+'.tsv', sep='\t', index_col=0, dtype='str')

read_tsv("data/tweets", "train")
# read_tsv("data/2015/tweets", "train")
# read_tsv("data/2015/tweets", "dev")
# read_tsv("data/2015/tweets", "test")

Unnamed: 0_level_0,label
text,Unnamed: 1_level_1
Nationals' Cole suspended 5 games for throwing at Kang #sports,2
"RT Chet_Cannon: 'Gay sex-worker, 26, dies of meth overdose at Hollywood home of high-profile Democrat donor' … https://t.co/WtVAFIiJRV",1
"It's officially ""nah I'm good weather"". So if you invite me somewhere, just know I'm good lol",0
Rape Survivor Explains Why She Supports The Second Amendment [VIDEO] https://t.co/N6jxQsH59j,0
Chitown in DC. #westside https://t.co/d3hCWCuI6Q,0
...,...
That`s really funny http://t.co/voqXc623rY,0
Here Are Six GOP TRAITOR Senators Who Need To GO https://t.co/7hUwRQ3nSR https://t.co/mXGP5gYQe9,1
NYC ✈️ https://t.co/2ZukGPIX9Y,0
"PD: Person shot, killed near Findlay Playground https://t.co/odePF7CNR8 https://t.co/hYlzyi2zZM",2
