In [1]:
import pandas as pd
import numpy as np

Bring in the datasets (Note: `pol_tweet_data` and `troll_tweet_data` are very large)

In [2]:
# Politician tweets: semicolon-delimited file; column types are left to pandas' inference.
pol_tweets_path = './datasets_large/politician-tweets/pol_tweets.csv'
pol_tweet_data = pd.read_csv(pol_tweets_path, sep=';')

In [3]:
# Troll tweets: pin the dtype of every listed column instead of letting pandas guess.
# 'updates' is not listed here, so its type is still inferred.
troll_dtypes = {
    'external_author_id': str,
    'author': str,
    'content': str,
    'region': str,
    'language': str,
    'publish_date': str,
    'harvested_date': str,
    'following': int,
    'followers': int,
    'post_type': str,
    'account_type': str,
    'retweet': bool,
    'account_category': str,
    'new_june_2018': bool,
    'alt_external_id': str,
    'tweet_id': str,
    'article_url': str,
    'tco1_step1': str,
    'tco2_step1': str,
    'tco3_step1': str,
}
troll_tweet_data = pd.read_csv('./datasets_large/russian-troll-tweets/combined.csv',
                               dtype=troll_dtypes)

In [4]:
# News tweets: semicolon-delimited; each of the columns named below is read as a string.
news_string_columns = ['tweet_id', 'author_id', 'publish_date', 'content',
                       'link_url', 'account_category', 'author', 'account_type']
news_tweet_data = pd.read_csv("./datasets_large/news-tweets/news_tweets.csv",
                              sep=";",
                              dtype=dict.fromkeys(news_string_columns, str))

In [5]:
# Politician account table (semicolon-delimited); used further down to look up
# each tweet author's handle and party.
pol_accounts_path = "./datasets_large/politician-tweets/pol_accounts.csv"
pol_authors = pd.read_csv(pol_accounts_path, sep=";")

In [6]:
# Report (rows, columns) for each freshly loaded tweet frame.
for caption, frame in (("Troll tweets size:", troll_tweet_data),
                       ("Politician tweets size: ", pol_tweet_data),
                       ("News tweets size:", news_tweet_data)):
    print(caption, frame.shape)

Troll tweets size: (2946207, 21)
Politician tweets size:  (1661553, 10)
News tweets size: (41772, 8)


We need to downsample the troll and politician tweets for efficiency and to reduce class imbalance later on.

In [7]:
# The troll and politician sets are far larger than the news set, so cut both
# down to a fixed-size random sample. This improves processing efficiency and
# helps prevent class imbalance in the model.
DOWNSAMPLE_SIZE = 200000
DOWNSAMPLE_SEED = 42
troll_tweet_data = troll_tweet_data.sample(n=DOWNSAMPLE_SIZE, random_state=DOWNSAMPLE_SEED)
pol_tweet_data = pol_tweet_data.sample(n=DOWNSAMPLE_SIZE, random_state=DOWNSAMPLE_SEED)

Now we need to drop some of the features and rename others. Since the News Tweets have already been formatted, we'll use that as a template. 

In [8]:
# Reference schema: the news tweets are already formatted, so the other two
# frames are reshaped to match these column names.
news_tweet_data.columns

Index(['tweet_id', 'author_id', 'publish_date', 'content', 'link_url',
       'account_category', 'author', 'account_type'],
      dtype='object')

In [9]:
# Raw troll tweet columns, listed to decide which to drop and which to rename.
troll_tweet_data.columns

Index(['external_author_id', 'author', 'content', 'region', 'language',
       'publish_date', 'harvested_date', 'following', 'followers', 'updates',
       'post_type', 'account_type', 'retweet', 'account_category',
       'new_june_2018', 'alt_external_id', 'tweet_id', 'article_url',
       'tco1_step1', 'tco2_step1', 'tco3_step1'],
      dtype='object')

In [10]:
# Raw politician tweet columns, listed to decide which to drop and which to rename.
pol_tweet_data.columns

Index(['id', 'user_id', 'created_at', 'tweet_text', 'hashtag_entities',
       'url_entities', 'favorites_count', 'retweet_count', 'quoted_status_id',
       'in_reply_to_status_id'],
      dtype='object')

In [11]:
# Remove the troll columns that are not carried into the combined dataset.
# This list includes 'article_url', so no link column survives for troll tweets.
troll_unused_columns = [
    'region', 'language', 'harvested_date', 'following', 'followers',
    'updates', 'new_june_2018', 'alt_external_id', 'tco1_step1',
    'tco2_step1', 'tco3_step1', 'retweet', 'post_type', 'article_url',
]
troll_tweet_data.drop(columns=troll_unused_columns, inplace=True)

In [12]:
# Remove the politician columns that are not carried into the combined dataset.
# This list includes 'url_entities', so no link column survives for politician tweets.
pol_unused_columns = [
    'hashtag_entities', 'favorites_count', 'retweet_count',
    'quoted_status_id', 'in_reply_to_status_id', 'url_entities',
]
pol_tweet_data.drop(columns=pol_unused_columns, inplace=True)

In [13]:
# Align the troll column names with the news tweet schema.
# The former 'article_url' -> 'link_url' entry was removed: 'article_url' is
# dropped before this cell runs, so that entry never matched anything and only
# suggested a 'link_url' column that does not exist for troll tweets.
# index=str is kept as-is; it converts the row labels to strings.
troll_tweet_data.rename(index=str, columns={'external_author_id': 'author_id'}, inplace=True)

In [14]:
# Align the politician column names with the news tweet schema.
# The former 'url_entities' -> 'link_url' entry was removed: 'url_entities' is
# dropped before this cell runs, so that entry never matched anything and only
# suggested a 'link_url' column that does not exist for politician tweets.
# index=str is kept as-is; it converts the row labels to strings.
pol_tweet_data.rename(index=str,
                      columns={'id': 'tweet_id',
                               'user_id': 'author_id',
                               'created_at': 'publish_date',
                               'tweet_text': 'content'},
                      inplace=True)

In [15]:
# Show the three schemas side by side to check how far they now line up.
for caption, frame in (("News Tweets Columns:", news_tweet_data),
                       ("Troll Tweets Columns:", troll_tweet_data),
                       ("Politician Tweets Columns:", pol_tweet_data)):
    print(caption, frame.columns)

News Tweets Columns: Index(['tweet_id', 'author_id', 'publish_date', 'content', 'link_url',
       'account_category', 'author', 'account_type'],
      dtype='object')
Troll Tweets Columns: Index(['author_id', 'author', 'content', 'publish_date', 'account_type',
       'account_category', 'tweet_id'],
      dtype='object')
Politician Tweets Columns: Index(['tweet_id', 'author_id', 'publish_date', 'content'], dtype='object')


Next we need to do some feature engineering and deal with useless data. For `troll_tweet_data`, I want only the tweets from trolls (the dataset includes tweets from authors not linked to Russian troll groups, such as non-Russian accounts, meme pages, and paid news), so we will drop everything from that dataset that isn't a troll. We will also rename the troll categories in the `account_category` feature to remove their leaning, which simplifies categorization; the leaning will be included in the `account_type` feature.

For the politician tweets, we need to grab the author handles and their political lean from the author dataframe. We also set the category to politician.

For all three dataframes, we will remove non-English tweets. This also removes tweets that contain nothing but emojis or URLs, because no language can be detected for them.

In [16]:
# Collapse the two partisan troll labels into a single "Troll" category.
is_partisan_troll = troll_tweet_data['account_category'].isin(["RightTroll", "LeftTroll"])
troll_tweet_data.loc[is_partisan_troll, 'account_category'] = "Troll"

In [17]:
# Tweets per account category, with LeftTroll/RightTroll already merged into "Troll".
troll_tweet_data['account_category'].value_counts()

Troll           77171
NonEnglish      55706
NewsFeed        40999
HashtagGamer    16308
Commercial       8167
Unknown           885
Fearmonger        764
Name: account_category, dtype: int64

In [18]:
# Keep only the tweets whose account category is "Troll".
# The explicit .copy() makes the result an independent frame, so the column
# added to it later ('is_eng') does not trigger pandas' SettingWithCopyWarning.
troll_tweet_data = troll_tweet_data.loc[troll_tweet_data['account_category'] == "Troll"].copy()

In [19]:
# Every row in this frame comes from a politician account, so the category is constant.
pol_tweet_data = pol_tweet_data.assign(account_category="Politician")

In [20]:
# Get the handle name from the authors dataframe
def grab_authors(x, authors=None):
    """Return the ``screen_name`` of the account that wrote a tweet.

    Parameters
    ----------
    x : mapping or pandas.Series
        One tweet row; only ``x['author_id']`` is read.
    authors : pandas.DataFrame, optional
        Account table with ``id`` and ``screen_name`` columns. When omitted,
        the notebook-level ``pol_authors`` frame is used.

    Returns
    -------
    The ``screen_name`` of the first row in ``authors`` whose ``id`` equals
    ``x['author_id']``, or ``None`` when no row matches.
    """
    if authors is None:
        authors = pol_authors
    handles = authors.loc[authors['id'] == x['author_id'], 'screen_name'].values
    # An unmatched author id used to raise IndexError here and abort the whole
    # DataFrame.apply(); it is now reported as a missing value instead.
    return handles[0] if len(handles) else None

In [21]:
# Grab the political stance from the authors dataframe
def grab_party(x, authors=None):
    """Classify a tweet's author as "Left", "Right" or "Other".

    The decision is a case-insensitive substring test on the author's
    ``array_agg`` value: "democrat" maps to "Left", otherwise "republican"
    maps to "Right", otherwise "Other".

    Parameters
    ----------
    x : mapping or pandas.Series
        One tweet row; only ``x['author_id']`` is read.
    authors : pandas.DataFrame, optional
        Account table with ``id`` and ``array_agg`` columns. When omitted,
        the notebook-level ``pol_authors`` frame is used.

    Returns
    -------
    str
        "Left", "Right" or "Other". "Other" is also returned when the author
        is not in ``authors`` or its ``array_agg`` value is not a string.
    """
    if authors is None:
        authors = pol_authors
    # Look the author up once; the original repeated this scan for every branch.
    matches = authors.loc[authors['id'] == x['author_id'], 'array_agg'].values
    affiliation = matches[0] if len(matches) else None
    # Unknown author or missing (None/NaN) value: no affiliation to inspect.
    if not isinstance(affiliation, str):
        return "Other"
    affiliation = affiliation.lower()
    if "democrat" in affiliation:
        return "Left"
    if "republican" in affiliation:
        return "Right"
    return "Other"

In [22]:
# Politician tweets still lack an author handle and an account type (left, right).
# Fill in the handle here by looking up every row's author.
pol_tweet_data = pol_tweet_data.assign(author=pol_tweet_data.apply(grab_authors, axis=1))

In [23]:
# Fill in the account type for every politician tweet from the author's party.
pol_tweet_data = pol_tweet_data.assign(account_type=pol_tweet_data.apply(grab_party, axis=1))

In [24]:
#all_tweet_data = pd.concat([pol_tweet_data, troll_tweet_data], sort=False)

In [25]:
#news_tweet_data = pd.read_csv('datasets/news-tweets/news_tweets.csv', sep=';')
#all_tweet_data = pd.concat([all_tweet_data, news_tweet_data], sort=False)

In [26]:
#all_tweet_data.to_csv('datasets/modified_data/combined_tweets.csv', index=False, sep=";")

In [27]:
# Need to drop non-English tweets
from langdetect import DetectorFactory, detect
from langdetect.lang_detect_exception import LangDetectException

# langdetect is non-deterministic unless the factory is seeded before the first
# detection; a fixed seed keeps the English filter reproducible between runs.
DetectorFactory.seed = 0


def detect_english(text):
    """Return 1 when langdetect classifies ``text`` as English, otherwise 0.

    Parameters
    ----------
    text : object
        Tweet content. Anything that is not a ``str`` (for example a missing
        value) is treated as non-English.

    Returns
    -------
    int
        1 if the detected language code is "en"; 0 for any other language,
        for non-string input, and for text in which langdetect cannot detect
        a language (it raises ``LangDetectException`` in that case).
    """
    if not isinstance(text, str):
        return 0
    try:
        lang = detect(text)
    except LangDetectException:
        # Only the library's own "cannot detect" failure is swallowed; the old
        # bare `except:` also hid KeyboardInterrupt and genuine bugs.
        return 0
    return 1 if lang == "en" else 0

In [28]:
# Flag every news tweet as English (1) or not (0), then keep only the English rows.
news_tweet_data['is_eng'] = news_tweet_data['content'].map(detect_english)
news_tweet_data = news_tweet_data[news_tweet_data['is_eng'] == 1]

https://t.co/r4Crcskcgv
https://t.co/rPUUe0WPZT
https://t.co/wcEDpmYHev
https://t.co/WffaKBJldo
https://t.co/61Zi9imHEf
https://t.co/PVfkMzJQYm
https://t.co/ZZGFozXXE6
https://t.co/hYFFJctpd4
https://t.co/6TNKIuMNYi
https://t.co/BDe3IihErD
https://t.co/rTD8HHu3YS
https://t.co/mZf8bcKCNN
https://t.co/1Ab0b25clS
https://t.co/ZFRjXY913h
😏 https://t.co/ZLpSWDtfUn
🔥🔥🔥 https://t.co/m8KRDiiyZl
https://t.co/jwi56YIpwt https://t.co/zVYBUEmt63
🤫
https://t.co/ivXQVRb3TB
https://t.co/D8iCUWFUyF https://t.co/Zprd6a4Lvw
😢 https://t.co/oeKF4FLzeT
https://t.co/sKfXgIe96y
https://t.co/jsLrVcusQr
https://t.co/2El6yFErcO
https://t.co/0U1gFBYrSO
https://t.co/RlsymtfwaA
https://t.co/KPMw1nn6cw
https://t.co/G2jzigT7rH
https://t.co/Gw6qxBscoJ
https://t.co/iDAaxMDrvW
https://t.co/6D0ptaxVMN
üèÜ üèÜ üèÜ  https://t.co/G4eeWOnsO4
https://t.co/6DXdhl57zS
https://t.co/gQKhpyNRdO
https://t.co/goZwJFA6wU
https://t.co/hJrDAQqsuY
https://t.co/w3IgCuIBat
https://t.co/zD8nmQI2ez
https://t.co/MxRwhxc

In [29]:
# Flag every troll tweet as English (1) or not (0), then keep only the English rows.
troll_tweet_data['is_eng'] = troll_tweet_data['content'].map(detect_english)
troll_tweet_data = troll_tweet_data[troll_tweet_data['is_eng'] == 1]

https://t.co/SKZOJos6C4
https://t.co/FB0e6ZQeDX
üíØüíØ https://t.co/cTUKrwzirz
https://t.co/BeFjQsJnQ6
http://t.co/TQI3VboOKN
https://t.co/jDtYx9h9oT
https://t.co/pRiPpsc5hZ
https://t.co/Le1ZRH80Bd
https://t.co/Bv9PjCezJK
https://t.co/aLxCVIfR2m
https://t.co/QWAWSQ071D
https://t.co/Zvhj8kXNFD
!! https://t.co/foO1ttyDjH
https://t.co/GzIGOYu6VV
https://t.co/pSR5Iabfnc
https://t.co/KMP8gc1Z1i
https://t.co/VSlI8clmwm
https://t.co/SB5Qf1qRCA
https://t.co/h86L0Pci7p
58. https://t.co/Cz3MO5idbK
https://t.co/gCUC2jKwa1
https://t.co/NZ0QlLz2x6
https://t.co/mcnZAJzQs1
https://t.co/2DQLQlLuwy
https://t.co/cFmvwAE7Pk
https://t.co/2Vpz207nXB https://t.co/q4qXxrAej4
https://t.co/KRTU5rirXS
https://t.co/KqM77ahhEN https://t.co/DHNdUuRMs7
https://t.co/uiK8TcHtjK
https://t.co/3iHYp4FG4J
https://t.co/8StRWRpv2M
https://t.co/zuIzbpejyw
https://t.co/6GSxBbIAGs
https://t.co/H93WoEnv4D
https://t.co/h4IJw703xt
https://t.co/8aJUZkCH1V
https://t.co/KhE3g0tp2P
https://t.co/WylTJVXtuI
https://t.co/mFI9B57kCi
h

In [30]:
# Flag every politician tweet as English (1) or not (0), then keep only the English rows.
pol_tweet_data['is_eng'] = pol_tweet_data['content'].map(detect_english)
pol_tweet_data = pol_tweet_data[pol_tweet_data['is_eng'] == 1]

https://t.co/3tl3y5e2MP
https://t.co/b3i0Bxcb9V
https://t.co/C7GK4fPKqy
https://t.co/57nTo4Hi8i
https://t.co/x3J4WDMKn2
http://bit.ly/cRTlV9
https://t.co/dccfuwYIHS
http://fb.me/AuzqP3gs
http://t.co/7GPS7l6aC2 http://t.co/w5I3nZqQZi
http://t.co/JXJgJ2Fech http://t.co/1aWPM8Ak7J
https://t.co/R4nslZgxsp
https://t.co/zIwDcASYQo
https://t.co/r8bwIevEdr https://t.co/fL5sRIrkQs
http://t.co/VEtShOUATm http://t.co/EJy6orzoUF
 https://t.co/IvWFN6a0r4
http://t.co/NJ77jpMG
https://t.co/LOMoSOMJQV
https://t.co/KNmkm6y1P2
[?]?
https://t.co/bTSpws5IF7
https://t.co/8KuRPDI0Gi
https://t.co/CvEEy6saem
http://fb.me/x0ijmc6d
https://t.co/3QcRQSlYPg
http://t.co/5NGaHDdS
huhhh@kjzzphoenixwgH
https://t.co/TLWkR26DUx
http://fb.me/uc0eMvpg
https://t.co/inVvS0NcVd
http://t.co/UqjfxGiK http://t.co/hjaXDwTG
http://t.co/6vjWL0Up http://t.co/ZCEQ5tzU
https://t.co/OwjjdGvtWy
http://fb.me/Qk6xSne2
https://t.co/HxWpoGzDxq
https://t.co/EHnBWBDEuX
https://t.co/97RunK9HV2
https://t.co/ZDxhIIkxNe
http://fb.me/VkdkuGb3
ht

https://t.co/4fjrU9jtok
.@Michael_CAFC https://t.co/kMbOeBaJka
http://t.co/XBILnWZ3dA
https://t.co/kN3V4orOxK
https://t.co/tdgPjSepeo
http://t.co/ToDj3lbeyT
https://t.co/MIR0YqcdMR
https://t.co/ly3DENZiMi
https://t.co/IHOWYOLSl2
https://t.co/Wqgv2VU7x5
http://t.co/Nm0hfs98mg
http://t.co/O9DEBZY87T http://t.co/i3ak6sFzcI
http://t.co/INEAHkC0... http://t.co/9liTYMlT
https://t.co/Ouc1qxzAjh
https://t.co/r7FZcMuQEO
http://t.co/rTABIeISrh
http://t.co/YzU0TC4g3u http://t.co/bVNZXExrKq
http://www.facebook.com/photo.php?pid=4307371&l=7e3600f898&id=23444159584
http://t.co/WJxxG9ixPh
https://t.co/gYDRbNPc0c
http://t.co/T5RjeesT http://t.co/4aJWT9bE
 https://t.co/RwVQ1HvmfS
http://burrforsenate.com/response/
https://t.co/47MlppUj36
http://twitpic.com/2bv32f
https://t.co/TjwgerC1Dt
https://t.co/mP5la3wOug
https://t.co/zun3thEZsY
http://t.co/9AY5qQbX6K
https://t.co/h59kDBwBva
https://t.co/GmOrE9Qq5t
https://t.co/yZLdPFR7TI
http://t.co/izf8WNLKtB
https://t.co/2Is8N7x4Mf
https://t.co/sAvYpJcGDg
https

In [33]:
# Report (rows, columns) for each frame after the troll and English filters.
for caption, frame in (("Troll tweets size:", troll_tweet_data),
                       ("Politician tweets size: ", pol_tweet_data),
                       ("News tweets size:", news_tweet_data)):
    print(caption, frame.shape)

Troll tweets size: (72964, 8)
Politician tweets size:  (196525, 8)
News tweets size: (41069, 9)


In [42]:
# Count missing account_category values in each frame before they are combined.
print("NAs:")
for caption, frame in (("Troll:", troll_tweet_data),
                       ("Politician:", pol_tweet_data),
                       ("News:", news_tweet_data)):
    print(caption, frame['account_category'].isna().sum())

NAs:
Troll: 0
Politician: 0
News: 0


In [43]:
# Balance the classes: shrink the troll and politician frames to exactly as
# many rows as the news frame.
news_row_count = len(news_tweet_data)
troll_tweet_data = troll_tweet_data.sample(n=news_row_count, random_state=42)
pol_tweet_data = pol_tweet_data.sample(n=news_row_count, random_state=42)

In [44]:
# Stack the three balanced frames into one, with a fresh 0..n-1 index and the
# column order left unsorted.
frames_to_combine = [troll_tweet_data, pol_tweet_data, news_tweet_data]
combined_tweet_data = pd.concat(frames_to_combine, sort=False, ignore_index=True)

In [45]:
# Write the combined dataset as a semicolon-delimited CSV without the index column.
combined_tweet_data.to_csv(path_or_buf="./datasets/combined_data.csv", index=False, sep=";")