# Learning to Detect Fake Content on Twitter

## Contents

* Introduction
* Data
* Feature Engineering
* Classification
* Results
* Tweet Verification Assistant

In [34]:
# import the necessary libraries
import pandas as pd
import matplotlib.pyplot as plt
import json
import seaborn as sns
# Show up to 100 columns when rendering wide DataFrames.
pd.set_option('display.max_columns', 100)

## Data
### Load and Explore the Data

In [105]:
# load the data
# The regex separator requires the python parsing engine.
# NOTE(review): presumably the regex form was chosen so that quote characters
# inside tweets are not interpreted as CSV quoting -- confirm before switching
# to the faster C engine.
# post_id is read as text directly so the long ids never pass through a
# numeric dtype (a float upcast would silently lose digits).
data = pd.read_csv('data/posts.txt', sep=r'\t', engine='python',
                   dtype={'post_id': str})
data.shape

(17857, 7)

In [106]:
# Preview the first rows of the posts table.
data.head()

Unnamed: 0,post_id,post_text,user_id,image_id(s),username,timestamp,label
0,324597532548276224,Don't need feds to solve the #bostonbombing wh...,886672620.0,"boston_fake_03,boston_fake_35",SantaCruzShred,Wed Apr 17 18:57:37 +0000 2013,fake
1,325145334739267584,PIC: Comparison of #Boston suspect Sunil Tripa...,21992286.0,boston_fake_23,Oscar_Wang,Fri Apr 19 07:14:23 +0000 2013,fake
2,325152091423248385,I'm not completely convinced that it's this Su...,16428755.0,boston_fake_34,jamwil,Fri Apr 19 07:41:14 +0000 2013,fake
3,324554646976868352,Brutal lo que se puede conseguir en colaboraci...,303138574.0,"boston_fake_03,boston_fake_35",rubenson80,Wed Apr 17 16:07:12 +0000 2013,fake
4,324315545572896768,4chan and the bombing. just throwing it out th...,180460772.0,boston_fake_15,Slimlenny,Wed Apr 17 00:17:06 +0000 2013,fake


In [101]:
# load the tweet data
# The file holds one JSON document per line. Read it with an explicit encoding
# (the default is platform-dependent) and skip blank lines, which json.loads
# would otherwise reject.
with open('data/tweetsFeatsVMU2016.txt', encoding='utf-8') as f:
    tweet_records = [json.loads(line) for line in f if line.strip()]

# Build the frame without the '_id' column and keep the tweet id as a string
# so it can be matched against other string-typed id columns.
tweet_data = pd.DataFrame(tweet_records).drop(columns=['_id'])
tweet_data['id'] = tweet_data['id'].astype(str)
print('Data has {} samples with {} attributes'.format(*tweet_data.shape))

Data has 17857 samples with 34 attributes


In [102]:
# Preview the first rows of the tweet-level feature table.
tweet_data.head()

Unnamed: 0,alexaCountryRank,alexaDeltaRank,alexaPopularity,alexaReachRank,annotation,containsExclamationMark,containsFirstOrderPron,containsHappyEmo,containsQuestionMark,containsSadEmo,containsSecondOrderPron,containsThirdOrderPron,hasColon,hasExternalLink,hasPlease,id,itemLength,numExclamationMark,numHashtags,numMentions,numNegSentiWords,numNouns,numPosSentiWords,numQuestionMark,numSlangs,numURLs,numUppercaseChars,numWords,readability,retweetCount,urlHarmonic,urlIndegree,wotSafe,wotTrust
0,,,,,fake,False,False,False,True,False,False,True,True,False,False,263046056240115712,134,0,1,0,0.0,,0.0,1,1.0,1,3,25,,0,,,,
1,115.0,-26.0,373.0,284.0,fake,True,False,False,False,False,False,True,True,True,False,262995061304852481,133,2,0,1,0.0,,0.0,0,0.0,1,14,19,,0,,,,70.0
2,,,,,fake,False,False,False,False,False,False,False,False,False,False,262979898002534400,116,0,2,0,0.0,,0.0,0,1.0,1,5,17,,0,,,,
3,,,,,fake,False,False,False,False,False,False,False,False,False,False,262996108400271360,46,0,2,0,2.0,1.0,0.0,0,1.0,1,1,4,77.905,0,,,,
4,,,,,fake,False,False,False,False,False,False,False,False,False,False,263018881839411200,90,0,4,0,0.0,3.0,1.0,0,0.0,1,1,11,116.145,0,,,,


In [103]:
# load the user data
# The file holds one JSON document per line. Read it with an explicit encoding
# (the default is platform-dependent) and skip blank lines, which json.loads
# would otherwise reject.
with open('data/userFeatsVMU2016.txt', encoding='utf-8') as f:
    user_records = [json.loads(line) for line in f if line.strip()]

# Build the frame without the '_id' column and keep the id as a string
# so it can be matched against other string-typed id columns.
user_data = pd.DataFrame(user_records).drop(columns=['_id'])
user_data['id'] = user_data['id'].astype(str)

print('User data has {} samples with {} attributes'.format(*user_data.shape))


User data has 17857 samples with 26 attributes


In [104]:
# Preview the first rows of the user-level feature table.
user_data.head()

Unnamed: 0,FolFrieRatio,accountAge,alexaCountryRank,alexaDeltaRank,alexaPopularity,alexaReachRank,annotation,harmonic,hasBio,hasExistingLocation,hasHeaderImg,hasLocation,hasProfileImg,hasURL,id,indegree,isVerified,numFavorites,numFollowers,numFriends,numMediaContent,numTweets,timesListed,tweetRatio,username,wotTrustUser
0,5.833922,1234982831,,,,,fake,,True,False,False,True,True,False,263046056240115712,,False,5243,1651,283,1069,43811,64,32.47286,iAnnieM,
1,43.68981,1284854676,,,,,fake,18350080.0,True,True,False,True,True,True,262995061304852481,8.0,False,45,95637,2189,10400,54293,737,70.346275,CarlosVerareal,
2,0.862576,1271108498,,,,,fake,,True,False,False,True,True,False,262979898002534400,,False,274,1701,1972,6455,34414,13,36.97036,LucasPalape,
3,0.88015,1295804773,,,,,fake,,True,False,False,True,True,False,262996108400271360,,False,6,235,267,0,17837,0,27.651606,Haaaaarryyy,
4,0.937824,1297372694,,,,,fake,,True,False,False,True,True,False,263018881839411200,,False,2867,181,193,1223,25754,0,41.076424,princess__natt,


### Check the missing values

In [52]:
# For every tweet feature, report how many entries are null and how many
# distinct values the column holds (Series.unique keeps NaN as a value).
features = tweet_data.columns.values
for feature in features:
    column = tweet_data[feature]
    n_missing = column.isnull().sum()
    n_unique = len(column.unique())
    print(feature, '- Missing:', n_missing, '- Unique:', n_unique)

alexaCountryRank - Missing: 15535 - Unique: 308
alexaDeltaRank - Missing: 15535 - Unique: 415
alexaPopularity - Missing: 15535 - Unique: 468
alexaReachRank - Missing: 15535 - Unique: 468
annotation - Missing: 2228 - Unique: 3
containsExclamationMark - Missing: 0 - Unique: 2
containsFirstOrderPron - Missing: 2710 - Unique: 3
containsHappyEmo - Missing: 0 - Unique: 2
containsQuestionMark - Missing: 0 - Unique: 2
containsSadEmo - Missing: 0 - Unique: 2
containsSecondOrderPron - Missing: 2710 - Unique: 3
containsThirdOrderPron - Missing: 2710 - Unique: 3
hasColon - Missing: 0 - Unique: 2
hasExternalLink - Missing: 0 - Unique: 2
hasPlease - Missing: 0 - Unique: 2
id - Missing: 0 - Unique: 17857
itemLength - Missing: 0 - Unique: 128
numExclamationMark - Missing: 0 - Unique: 15
numHashtags - Missing: 0 - Unique: 17
numMentions - Missing: 0 - Unique: 8
numNegSentiWords - Missing: 2710 - Unique: 8
numNouns - Missing: 2603 - Unique: 26
numPosSentiWords - Missing: 2710 - Unique: 8
numQuestionMark

#### Number of positive/negative words

In [115]:
# Pair the tweets that have no numPosSentiWords value with their original text
# (joined on tweet id == post id) and preview the first matches.
missing_senti = tweet_data.loc[tweet_data['numPosSentiWords'].isna()]
post_texts = data[['post_id', 'post_text']]
missing_senti.merge(post_texts, left_on='id', right_on='post_id').head()

Unnamed: 0,alexaCountryRank,alexaDeltaRank,alexaPopularity,alexaReachRank,annotation,containsExclamationMark,containsFirstOrderPron,containsHappyEmo,containsQuestionMark,containsSadEmo,containsSecondOrderPron,containsThirdOrderPron,hasColon,hasExternalLink,hasPlease,id,itemLength,numExclamationMark,numHashtags,numMentions,numNegSentiWords,numNouns,numPosSentiWords,numQuestionMark,numSlangs,numURLs,numUppercaseChars,numWords,readability,retweetCount,urlHarmonic,urlIndegree,wotSafe,wotTrust,post_id,post_text
0,,,,,fake,False,,False,False,False,,,False,False,False,263111677485142017,69,0,5,0,,4.0,,0,,1,0,5,,0,,,,,263111677485142017,#sandy #newyork #hurricane #statueofliberty #U...
1,,,,,fake,False,,False,False,False,,,False,False,False,262977091983785985,36,0,2,0,,1.0,,0,,1,0,2,,0,,,,,262977091983785985,#nyc #hurricane http://t.co/Gv3QxZlq
2,,,,,fake,False,,False,False,False,,,False,False,False,263129115207536640,45,0,3,0,,,,0,,1,0,3,,0,,,,,263129115207536640,#Crazy #Hurricane #Sandy http://t.co/0zrMsgvs
3,,,,,fake,False,,False,False,False,,,False,False,False,263091320871063552,66,0,5,0,,1.0,,0,,1,0,5,,0,,,,,263091320871063552,#shark #newjersey #swim #sandy #hurricane  ...
4,,,,,fake,False,,False,False,False,,,False,False,False,263047501433688064,53,0,1,1,,,,0,,1,1,6,,1,,,,,263047501433688064,Craziest picture ever #hurricane http://t.co/K...


In [117]:
# Treat missing sentiment-word counts as zero.
# Assign the filled columns back explicitly: calling fillna(inplace=True) on a
# selected column is chained assignment, which pandas deprecates and which
# does not update the parent frame under Copy-on-Write.
senti_cols = ['numPosSentiWords', 'numNegSentiWords']
tweet_data[senti_cols] = tweet_data[senti_cols].fillna(0)

**Notes/Ideas**
* Predict the number of nouns where it is missing, using the nouns-per-word ratio observed in the rest of the tweets.
* Use linear regression for the credibility metrics (e.g. to estimate their missing values).