# Learning to Detect Fake Content on Twitter

## Contents

* Introduction
* Data
* Feature Engineering
* Classification
* Results
* Tweet Verification Assistant

In [40]:
# import the necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import json
import seaborn as sns
pd.options.display.max_columns = 100

## Data
### Load and Explore the Data

In [41]:
# load the data
data  = pd.read_csv('data/tweets_info.csv')
data['id'] = data['id'].astype(str)
data.shape

(17857, 4)

In [42]:
data.head()

Unnamed: 0,id,text,event,label
0,324597532548276224,Don't need feds to solve the #bostonbombing wh...,Boston,fake
1,325145334739267584,PIC: Comparison of #Boston suspect Sunil Tripa...,Boston,fake
2,325152091423248385,I'm not completely convinced that it's this Su...,Boston,fake
3,324554646976868352,Brutal lo que se puede conseguir en colaboraci...,Boston,fake
4,324315545572896768,4chan and the bombing. just throwing it out th...,Boston,fake


In [43]:
# load the tweet data
with open('data/tweetsFeatsVMU2016.txt') as f:
    lines = f.readlines()

lines = [json.loads(line) for line in lines]
tweet_data  = pd.DataFrame(lines)
tweet_data.drop(columns=['_id', 'annotation'], inplace=True)
tweet_data['id'] = tweet_data['id'].astype(str)
print('Data has {} samples with {} attributes'.format(*tweet_data.shape))

Data has 17857 samples with 33 attributes


In [44]:
tweet_data.tail()

Unnamed: 0,alexaCountryRank,alexaDeltaRank,alexaPopularity,alexaReachRank,containsExclamationMark,containsFirstOrderPron,containsHappyEmo,containsQuestionMark,containsSadEmo,containsSecondOrderPron,containsThirdOrderPron,hasColon,hasExternalLink,hasPlease,id,itemLength,numExclamationMark,numHashtags,numMentions,numNegSentiWords,numNouns,numPosSentiWords,numQuestionMark,numSlangs,numURLs,numUppercaseChars,numWords,readability,retweetCount,urlHarmonic,urlIndegree,wotSafe,wotTrust
17852,79.0,28.0,344.0,272.0,False,False,False,False,False,False,False,True,True,False,712226185707712512,129,0,1,0,0.0,5.0,0.0,0,1.0,2,14,13,41.852,0,22020096.0,118384.0,,63.0
17853,79.0,28.0,344.0,272.0,False,False,False,False,False,False,False,True,True,False,712226175721066496,119,0,1,0,0.0,5.0,0.0,0,1.0,2,6,12,61.325,63,22020096.0,118384.0,,63.0
17854,,,,,False,False,False,False,False,False,False,True,False,False,712222764514484224,135,0,0,0,0.0,9.0,0.0,0,2.0,1,3,19,61.89,0,,,,
17855,,,,,False,False,False,False,False,False,False,True,False,False,712210186438631424,115,0,0,0,0.0,5.0,0.0,0,0.0,1,9,13,37.455,0,,,,
17856,,,,,False,False,False,False,False,False,False,True,False,False,712210186350493696,115,0,0,0,0.0,5.0,0.0,0,0.0,1,9,13,37.455,0,,,,


In [45]:
# load the user data
with open('data/userFeatsVMU2016.txt') as f:
    lines = f.readlines()

lines = [json.loads(line) for line in lines]
user_data = pd.DataFrame(lines)
user_data.drop(columns=['_id', 'annotation'], inplace=True)
user_data['id'] = user_data['id'].astype(str)

print('User data has {} samples with {} attributes'.format(*user_data.shape))


User data has 17857 samples with 25 attributes


In [46]:
user_data.head()

Unnamed: 0,FolFrieRatio,accountAge,alexaCountryRank,alexaDeltaRank,alexaPopularity,alexaReachRank,harmonic,hasBio,hasExistingLocation,hasHeaderImg,hasLocation,hasProfileImg,hasURL,id,indegree,isVerified,numFavorites,numFollowers,numFriends,numMediaContent,numTweets,timesListed,tweetRatio,username,wotTrustUser
0,5.833922,1234982831,,,,,,True,False,False,True,True,False,263046056240115712,,False,5243,1651,283,1069,43811,64,32.47286,iAnnieM,
1,43.68981,1284854676,,,,,18350080.0,True,True,False,True,True,True,262995061304852481,8.0,False,45,95637,2189,10400,54293,737,70.346275,CarlosVerareal,
2,0.862576,1271108498,,,,,,True,False,False,True,True,False,262979898002534400,,False,274,1701,1972,6455,34414,13,36.97036,LucasPalape,
3,0.88015,1295804773,,,,,,True,False,False,True,True,False,262996108400271360,,False,6,235,267,0,17837,0,27.651606,Haaaaarryyy,
4,0.937824,1297372694,,,,,,True,False,False,True,True,False,263018881839411200,,False,2867,181,193,1223,25754,0,41.076424,princess__natt,


#### Merge data to have a complete view

In [47]:
tweet_data = pd.merge(tweet_data, data, on='id')
tweet_data.head()

Unnamed: 0,alexaCountryRank,alexaDeltaRank,alexaPopularity,alexaReachRank,containsExclamationMark,containsFirstOrderPron,containsHappyEmo,containsQuestionMark,containsSadEmo,containsSecondOrderPron,containsThirdOrderPron,hasColon,hasExternalLink,hasPlease,id,itemLength,numExclamationMark,numHashtags,numMentions,numNegSentiWords,numNouns,numPosSentiWords,numQuestionMark,numSlangs,numURLs,numUppercaseChars,numWords,readability,retweetCount,urlHarmonic,urlIndegree,wotSafe,wotTrust,text,event,label
0,,,,,False,False,False,True,False,False,True,True,False,False,263046056240115712,134,0,1,0,0.0,,0.0,1,1.0,1,3,25,,0,,,,,¿Se acuerdan de la película: “El día después d...,Sandy,fake
1,115.0,-26.0,373.0,284.0,True,False,False,False,False,False,True,True,True,False,262995061304852481,133,2,0,1,0.0,,0.0,0,0.0,1,14,19,,0,,,,70.0,@milenagimon: Miren a Sandy en NY! Tremenda i...,Sandy,fake
2,,,,,False,False,False,False,False,False,False,False,False,False,262979898002534400,116,0,2,0,0.0,,0.0,0,1.0,1,5,17,,0,,,,,"Buena la foto del Huracán Sandy, me recuerda a...",Sandy,fake
3,,,,,False,False,False,False,False,False,False,False,False,False,262996108400271360,46,0,2,0,2.0,1.0,0.0,0,1.0,1,1,4,77.905,0,,,,,Scary shit #hurricane #NY http://t.co/e4JLBUfH,Sandy,fake
4,,,,,False,False,False,False,False,False,False,False,False,False,263018881839411200,90,0,4,0,0.0,3.0,1.0,0,0.0,1,1,11,116.145,0,,,,,My fave place in the world #nyc #hurricane #sa...,Sandy,fake


In [48]:
user_data = pd.merge(user_data, data, on='id')
user_data.head()

Unnamed: 0,FolFrieRatio,accountAge,alexaCountryRank,alexaDeltaRank,alexaPopularity,alexaReachRank,harmonic,hasBio,hasExistingLocation,hasHeaderImg,hasLocation,hasProfileImg,hasURL,id,indegree,isVerified,numFavorites,numFollowers,numFriends,numMediaContent,numTweets,timesListed,tweetRatio,username,wotTrustUser,text,event,label
0,5.833922,1234982831,,,,,,True,False,False,True,True,False,263046056240115712,,False,5243,1651,283,1069,43811,64,32.47286,iAnnieM,,¿Se acuerdan de la película: “El día después d...,Sandy,fake
1,43.68981,1284854676,,,,,18350080.0,True,True,False,True,True,True,262995061304852481,8.0,False,45,95637,2189,10400,54293,737,70.346275,CarlosVerareal,,@milenagimon: Miren a Sandy en NY! Tremenda i...,Sandy,fake
2,0.862576,1271108498,,,,,,True,False,False,True,True,False,262979898002534400,,False,274,1701,1972,6455,34414,13,36.97036,LucasPalape,,"Buena la foto del Huracán Sandy, me recuerda a...",Sandy,fake
3,0.88015,1295804773,,,,,,True,False,False,True,True,False,262996108400271360,,False,6,235,267,0,17837,0,27.651606,Haaaaarryyy,,Scary shit #hurricane #NY http://t.co/e4JLBUfH,Sandy,fake
4,0.937824,1297372694,,,,,,True,False,False,True,True,False,263018881839411200,,False,2867,181,193,1223,25754,0,41.076424,princess__natt,,My fave place in the world #nyc #hurricane #sa...,Sandy,fake


### Check the missing values

In [49]:
features=tweet_data.columns.values
for feature in features:
    print(feature,'- Missing:', 
          sum(tweet_data[feature].isnull()),
          '- Unique:', len(tweet_data[feature].unique()))

alexaCountryRank - Missing: 15535 - Unique: 308
alexaDeltaRank - Missing: 15535 - Unique: 415
alexaPopularity - Missing: 15535 - Unique: 468
alexaReachRank - Missing: 15535 - Unique: 468
containsExclamationMark - Missing: 0 - Unique: 2
containsFirstOrderPron - Missing: 2710 - Unique: 3
containsHappyEmo - Missing: 0 - Unique: 2
containsQuestionMark - Missing: 0 - Unique: 2
containsSadEmo - Missing: 0 - Unique: 2
containsSecondOrderPron - Missing: 2710 - Unique: 3
containsThirdOrderPron - Missing: 2710 - Unique: 3
hasColon - Missing: 0 - Unique: 2
hasExternalLink - Missing: 0 - Unique: 2
hasPlease - Missing: 0 - Unique: 2
id - Missing: 0 - Unique: 17857
itemLength - Missing: 0 - Unique: 128
numExclamationMark - Missing: 0 - Unique: 15
numHashtags - Missing: 0 - Unique: 17
numMentions - Missing: 0 - Unique: 8
numNegSentiWords - Missing: 2710 - Unique: 8
numNouns - Missing: 2603 - Unique: 26
numPosSentiWords - Missing: 2710 - Unique: 8
numQuestionMark - Missing: 0 - Unique: 8
numSlangs - M

#### Number of positive/negative words

Quick check to see what is the nature of the tweets that present no positive sentiment words.

In [50]:
pd.merge(tweet_data.loc[tweet_data['numPosSentiWords'].isna()], 
         data[['id', 'text']], on='id').head()

Unnamed: 0,alexaCountryRank,alexaDeltaRank,alexaPopularity,alexaReachRank,containsExclamationMark,containsFirstOrderPron,containsHappyEmo,containsQuestionMark,containsSadEmo,containsSecondOrderPron,containsThirdOrderPron,hasColon,hasExternalLink,hasPlease,id,itemLength,numExclamationMark,numHashtags,numMentions,numNegSentiWords,numNouns,numPosSentiWords,numQuestionMark,numSlangs,numURLs,numUppercaseChars,numWords,readability,retweetCount,urlHarmonic,urlIndegree,wotSafe,wotTrust,text_x,event,label,text_y
0,,,,,False,,False,False,False,,,False,False,False,263111677485142017,69,0,5,0,,4.0,,0,,1,0,5,,0,,,,,#sandy #newyork #hurricane #statueofliberty #U...,Sandy,fake,#sandy #newyork #hurricane #statueofliberty #U...
1,,,,,False,,False,False,False,,,False,False,False,262977091983785985,36,0,2,0,,1.0,,0,,1,0,2,,0,,,,,#nyc #hurricane http://t.co/Gv3QxZlq,Sandy,fake,#nyc #hurricane http://t.co/Gv3QxZlq
2,,,,,False,,False,False,False,,,False,False,False,263129115207536640,45,0,3,0,,,,0,,1,0,3,,0,,,,,#Crazy #Hurricane #Sandy http://t.co/0zrMsgvs,Sandy,fake,#Crazy #Hurricane #Sandy http://t.co/0zrMsgvs
3,,,,,False,,False,False,False,,,False,False,False,263091320871063552,66,0,5,0,,1.0,,0,,1,0,5,,0,,,,,#shark #newjersey #swim #sandy #hurricane  ...,Sandy,fake,#shark #newjersey #swim #sandy #hurricane  ...
4,,,,,False,,False,False,False,,,False,False,False,263047501433688064,53,0,1,1,,,,0,,1,1,6,,1,,,,,Craziest picture ever #hurricane http://t.co/K...,Sandy,fake,Craziest picture ever #hurricane http://t.co/K...


Most of them are tweets with lots of hashtags, which were not taken into account while looking for sentiment words. Let's fill the nan values with 0.

In [51]:
tweet_data['numPosSentiWords'].fillna(0, inplace=True)
tweet_data['numNegSentiWords'].fillna(0, inplace=True)

### Feature Engineering

#### Split data

In [None]:
# Split the data into features and target label
tweet_features_raw = tweet_data.drop

#### One-hot encoding
Some of our features are categorical, thus we need to apply one-hot encoding to them

In [52]:
tweet_features = pd.get_dummies(tweet_data)
tweet_features.head()

Unnamed: 0,alexaCountryRank,alexaDeltaRank,alexaPopularity,alexaReachRank,containsExclamationMark,containsHappyEmo,containsQuestionMark,containsSadEmo,hasColon,hasExternalLink,hasPlease,itemLength,numExclamationMark,numHashtags,numMentions,numNegSentiWords,numNouns,numPosSentiWords,numQuestionMark,numSlangs,numURLs,numUppercaseChars,numWords,readability,retweetCount,urlHarmonic,urlIndegree,wotSafe,wotTrust,containsFirstOrderPron_False,containsFirstOrderPron_True,containsSecondOrderPron_False,containsSecondOrderPron_True,containsThirdOrderPron_False,containsThirdOrderPron_True,id_121295835525943296,id_127873228076097536,id_131803489054629888,id_136146926939807744,id_149475310373568513,id_160577948271902721,id_161877619191709697,id_161959304830779392,id_162002314150940674,id_162038640342609920,id_162759262005768194,id_165197214715609090,id_169002779149471745,id_169004010643275778,id_176941527279472640,...,event_GarissaAttack,event_Livr,event_Malaysia,event_Nepal,event_Passport,event_PigFish,event_Samurai,event_Sandy,event_Sochi,event_SyrianBoy,event_Underwater,event_Varoufakis,event_airstrikes,event_american_soldier_quran,event_ankara_explosions,event_attacks_paris,event_black_lion,event_boko_haram,event_bowie_david,event_brussels_car_metro,event_brussels_explosions,event_burst_kfc,event_bush_book,event_convoy_explosion_turkey,event_donald_trump_attacker,event_eagle_kid,event_five_headed_snake,event_fuji_lenticular,event_gandhi_dancing,event_half_everything,event_hubble_telescope,event_immigrants,event_isis_children,event_john_guevara,event_mc_donalds_fee,event_nazi_submarine,event_north_korea,event_not_afraid,event_pakistan_explosion,event_pope_francis,event_protest,event_refugees,event_rio_moon,event_snowboard_girl,event_soldier_stealing,event_syrian_children,event_ukrainian_nazi,event_woman_14_children,label_fake,label_real
0,,,,,False,False,True,False,True,False,False,134,0,1,0,0.0,,0.0,1,1.0,1,3,25,,0,,,,,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
1,115.0,-26.0,373.0,284.0,True,False,False,False,True,True,False,133,2,0,1,0.0,,0.0,0,0.0,1,14,19,,0,,,,70.0,1,0,1,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
2,,,,,False,False,False,False,False,False,False,116,0,2,0,0.0,,0.0,0,1.0,1,5,17,,0,,,,,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
3,,,,,False,False,False,False,False,False,False,46,0,2,0,2.0,1.0,0.0,0,1.0,1,1,4,77.905,0,,,,,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0
4,,,,,False,False,False,False,False,False,False,90,0,4,0,0.0,3.0,1.0,0,0.0,1,1,11,116.145,0,,,,,1,0,1,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,1,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,0,1,0


Notes/Ideas
* predict num of nouns for missing data by looking into the ratio nouns per number of words on the rest of the tweets
* linear regression for the credibility metrics

## Classification

### Tweet-based classifier