In [144]:
# Task is to test automatic ways to classify viral social media content propagating fake images or representing real images in a false context
# In this coursework I analyze use cases
# Design machine learning algorithms
# Evaluate the resulting implementations

In [160]:
# Importing Libraries

import pandas as pd
import matplotlib.pyplot as plt
import re
from langdetect import detect
from nltk.corpus import stopwords
from collections import Counter
from textblob import TextBlob
import numpy as np
import plotly.express as px


In [161]:
#Load datasets
train_data = pd.read_csv("Datasets/mediaeval-2015-trainingset.txt", sep="\t", lineterminator='\n', skiprows=(0),  header=(0))
test_data = pd.read_csv("Datasets/mediaeval-2015-testset.txt", sep="\t", lineterminator='\n', skiprows=(0),  header=(0))


In [162]:
# ---- DATA ANALYSIS ---

#Manual inspection of the dataset
train_data.head()

Unnamed: 0,tweetId,tweetText,userId,imageId(s),username,timestamp,label
0,263046056240115712,¿Se acuerdan de la película: “El día después d...,21226711,sandyA_fake_46,iAnnieM,Mon Oct 29 22:34:01 +0000 2012,fake
1,262995061304852481,@milenagimon: Miren a Sandy en NY! Tremenda i...,192378571,sandyA_fake_09,CarlosVerareal,Mon Oct 29 19:11:23 +0000 2012,fake
2,262979898002534400,"Buena la foto del Huracán Sandy, me recuerda a...",132303095,sandyA_fake_09,LucasPalape,Mon Oct 29 18:11:08 +0000 2012,fake
3,262996108400271360,Scary shit #hurricane #NY http://t.co/e4JLBUfH,241995902,sandyA_fake_29,Haaaaarryyy,Mon Oct 29 19:15:33 +0000 2012,fake
4,263018881839411200,My fave place in the world #nyc #hurricane #sa...,250315890,sandyA_fake_15,princess__natt,Mon Oct 29 20:46:02 +0000 2012,fake


In [163]:
# Examining overall volume and quality of the test dataset

test_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 3755 entries, 0 to 3754
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   tweetId     3755 non-null   int64 
 1   tweetText   3755 non-null   object
 2   userId      3755 non-null   int64 
 3   imageId(s)  3755 non-null   object
 4   username    3755 non-null   object
 5   timestamp   3755 non-null   object
 6   label       3755 non-null   object
dtypes: int64(2), object(5)
memory usage: 205.5+ KB


In [164]:
# Examining overall volume and quality of the training dataset
train_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14277 entries, 0 to 14276
Data columns (total 7 columns):
 #   Column      Non-Null Count  Dtype 
---  ------      --------------  ----- 
 0   tweetId     14277 non-null  int64 
 1   tweetText   14277 non-null  object
 2   userId      14277 non-null  int64 
 3   imageId(s)  14277 non-null  object
 4   username    14277 non-null  object
 5   timestamp   14277 non-null  object
 6   label       14277 non-null  object
dtypes: int64(2), object(5)
memory usage: 780.9+ KB


In [165]:
# Getting the feature names
train_data.columns

Index(['tweetId', 'tweetText', 'userId', 'imageId(s)', 'username', 'timestamp',
       'label'],
      dtype='object')

In [166]:
# Renaming imageId(s) to imageId as it can cause issues further in the project
train_data.rename(columns={'imageId(s)': 'imageId'}, inplace = True)
test_data.rename(columns={'imageId(s)': 'imageId'}, inplace = True)

In [167]:
#Counting fake and humor labels in data
train_data.isin(['fake', 'humor']).sum(axis=0)

tweetId         0
tweetText       0
userId          0
imageId         0
username        0
timestamp       0
label        9356
dtype: int64

In [168]:
#Counting number of entries with identican tweetText
print("Train_data entries with duplicate tweetText")
train_data['tweetText'].duplicated().sum()

Train_data entries with duplicate tweetText


1901

In [169]:
train_data['tweetText'].describe()

count                                                 14277
unique                                                12376
top       Unbelievable scene flying over #StatenIsland i...
freq                                                     42
Name: tweetText, dtype: object

In [155]:
# ---- Training Data Analysis ---
# Format: 'tweetId', 'tweetText', 'userId', 'imageId', 'username', 'timestamp', 'label'
# Separator of features: Tab
# Separator of instances: New Line

# Volume: 14277 non-null entries

# Data quality:
# 1. Different languages
# 2. URLs at the end of each tweet
# 3. Special characters like hashtags and punctuation
# 4. Emojis in tweets
# 5. Typos
# 6. Features are non-null, mostly consisting of text data
# 7. Stopwords in tweets

# Data bias:
# 1. 9356 entries are labeled fake or humor
# 2. 1901 entries have the same tweetText and URL at the end of the tweetText
# 3. Additional 1060 entries have identical tweetText

# ---- Test Data Analysis ---
# Format, separator of features and instances same as in training data
# Volume: 3755 non-null entries

In [156]:
# Mapping labels to numerical data, and combinding fake with humor labels together
train_data['label'] = train_data['label'].map({'fake': 1, 'humor': 1, 'real': 0})
test_data['label'] = test_data['label'].map({'fake': 1, 'humor': 1, 'real': 0})

In [157]:
# ---- Data Visualization ---

# Tweet counts by label
fig = px.histogram(train_data, x='label', template='plotly_white', title='Tweet counts by label')
fig.update_xaxes(categoryorder='category descending', title='Label').update_yaxes(title='Number of tweets')
fig.show()

In [200]:
#Plot the lengths of tweets
train_data['length'] = train_data['tweetText'].str.len()
fig = px.histogram(train_data, x='length', template='plotly_white', title='Length of tweets')
fig.update_xaxes(categoryorder='category descending', title='length').update_yaxes(title='Tweets')
fig.show()

In [201]:
fig.update_layout(xaxis_range=[0,180])
fig.show()

In [208]:
fig.update_layout(xaxis_range=[180,7300])
fig.update_layout(yaxis_range=[0, 3])
fig.show()

In [None]:
# TODO - LANGUAGE DETECTION

In [140]:
# ---- Algorithm Design ---

# Preprocessing

In [113]:
# Creating a method for most relevant preprocessing steps
def myPreprocess(text):
    # Lower casing
    text = text.lower()
    # URL removal
    text = re.split('http.*', str(text))[0]
    # Noise removal
    text = text.replace('[^\w\s]', '')
    text = text.replace('#', '')
    return text

# Applying basic preprocessing to train_data and test_Data
train_data['cleanText'] = train_data['tweetText'].apply(myPreprocess)
test_data['cleanText'] = test_data['tweetText'].apply(myPreprocess)

In [114]:
train_data[['tweetText', 'cleanText', 'label']].head(6)

Unnamed: 0,tweetText,cleanText,label
0,¿Se acuerdan de la película: “El día después d...,¿se acuerdan de la película: “el día después d...,1
1,@milenagimon: Miren a Sandy en NY! Tremenda i...,@milenagimon: miren a sandy en ny! tremenda i...,1
2,"Buena la foto del Huracán Sandy, me recuerda a...","buena la foto del huracán sandy, me recuerda a...",1
3,Scary shit #hurricane #NY http://t.co/e4JLBUfH,scary shit hurricane ny,1
4,My fave place in the world #nyc #hurricane #sa...,my fave place in the world nyc hurricane sandy...,1
5,42nd #time #square #NYC #subway #hurricane htt...,42nd time square nyc subway hurricane,1


In [115]:
#Counting how many entries have identical tweetText
train_data['cleanText'].duplicated().sum()

2961

In [116]:
#Removing duplicate rows
train_data.drop_duplicates(subset=['cleanText'], keep='first', inplace=True, ignore_index = False)

In [117]:
# Emoji removal | Source: https://gist.github.com/slowkow/7a7f61f495e3dbb7e3d767f97bd7304b
def remove_emoji(string):
    emoji_pattern = re.compile("["
    u"\U0001F600-\U0001F64F"  # emoticons
    u"\U0001F300-\U0001F5FF"  # symbols & pictographs
    u"\U0001F680-\U0001F6FF"  # transport & map symbols
    u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
    u"\U00002702-\U000027B0"
    u"\U000024C2-\U0001F251"
    "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)

train_data['cleanText'] = train_data['cleanText'].apply(remove_emoji)
test_data['cleanText'] = test_data['cleanText'].apply(remove_emoji)

In [118]:
# Stop word removal | Source: https://towardsdatascience.com/text-preprocessing-for-data-scientist-3d2419c8199d
stopwordsString = set(stopwords.words('english'))

def remStopW(text):
    return " ".join([word for word in str(text).split() if word not in stopwordsString])

train_data['cleanText'] = train_data['cleanText'].apply(remStopW)
test_data['cleanText'] = test_data['cleanText'].apply(remStopW)

In [119]:
train_data[['tweetText', 'cleanText', 'label']].head(6)

Unnamed: 0,tweetText,cleanText,label
0,¿Se acuerdan de la película: “El día después d...,¿se acuerdan de la película: “el día después d...,1
1,@milenagimon: Miren a Sandy en NY! Tremenda i...,@milenagimon: miren sandy en ny! tremenda imag...,1
2,"Buena la foto del Huracán Sandy, me recuerda a...","buena la foto del huracán sandy, recuerda la p...",1
3,Scary shit #hurricane #NY http://t.co/e4JLBUfH,scary shit hurricane ny,1
4,My fave place in the world #nyc #hurricane #sa...,fave place world nyc hurricane sandy statueofl...,1
5,42nd #time #square #NYC #subway #hurricane htt...,42nd time square nyc subway hurricane,1
