In [258]:
# Import libraries 

import numpy as np # linear algebra
import pandas as pd # data processing, 

# Libraries for data visualization
import matplotlib.pyplot as pplt  
import plotly.express as px


# Load the tab-separated training set; row 0 of the file supplies the column names.
data = pd.read_csv(
    "mediaeval-2015-trainingset.txt",
    sep="\t",
    lineterminator="\n",
    skiprows=0,
    header=0,
)
# Source: https://stackoverflow.com/questions/27896214/reading-tab-delimited-file-with-pandas-works-on-windows-but-not-on-mac


In [244]:
data.head()

Unnamed: 0,tweetId,tweetText,userId,imageId(s),username,timestamp,label
0,263046056240115712,¿Se acuerdan de la película: “El día después d...,21226711,sandyA_fake_46,iAnnieM,Mon Oct 29 22:34:01 +0000 2012,fake
1,262995061304852481,@milenagimon: Miren a Sandy en NY! Tremenda i...,192378571,sandyA_fake_09,CarlosVerareal,Mon Oct 29 19:11:23 +0000 2012,fake
2,262979898002534400,"Buena la foto del Huracán Sandy, me recuerda a...",132303095,sandyA_fake_09,LucasPalape,Mon Oct 29 18:11:08 +0000 2012,fake
3,262996108400271360,Scary shit #hurricane #NY http://t.co/e4JLBUfH,241995902,sandyA_fake_29,Haaaaarryyy,Mon Oct 29 19:15:33 +0000 2012,fake
4,263018881839411200,My fave place in the world #nyc #hurricane #sa...,250315890,sandyA_fake_15,princess__natt,Mon Oct 29 20:46:02 +0000 2012,fake


In [259]:
data.rename(columns={'imageId(s)': 'imageId'}, inplace=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 14277 entries, 0 to 14276
Data columns (total 7 columns):
 #   Column     Non-Null Count  Dtype 
---  ------     --------------  ----- 
 0   tweetId    14277 non-null  int64 
 1   tweetText  14277 non-null  object
 2   userId     14277 non-null  int64 
 3   imageId    14277 non-null  object
 4   username   14277 non-null  object
 5   timestamp  14277 non-null  object
 6   label      14277 non-null  object
dtypes: int64(2), object(5)
memory usage: 780.9+ KB


In [260]:
#Counting fake and humor labels in data
data.isin(['fake', 'humor']).sum(axis=0)

tweetId         0
tweetText       0
userId          0
imageId         0
username        0
timestamp       0
label        9356
dtype: int64

In [261]:
#Finding how many tweets have identical text and could affect the classifier
data['tweetText'].duplicated().sum()
#Returns 1901

#Removing the duplicate rows
data.drop_duplicates(subset=['tweetText'], keep='first', inplace=True, ignore_index=False)

In [262]:
data['tweetText'].describe()

count                                                 12376
unique                                                12376
top       Guards at Tomb of Unknowns aren't intimidated ...
freq                                                      1
Name: tweetText, dtype: object

In [263]:
#Mapping the labels to numerical data to convert from categorical data
data['label'] = data['label'].map({'fake': 1, 'humor': 1, 'real': 0})

In [264]:
data.loc[data['label'] == 0]

Unnamed: 0,tweetId,tweetText,userId,imageId,username,timestamp,label
7741,263012737603473408,Wow RT@billmckibben: Boardwalk floating in sec...,14070799,sandyA_real_10,Eklund,Mon Oct 29 20:21:37 +0000 2012,0
7742,263110625377546240,@JHP_777 RT @billmckibben: Boardwalk floating ...,31626634,sandyA_real_10,track7grrl,Tue Oct 30 02:50:36 +0000 2012,0
7743,263022544708308992,#sandy #staysafe RT @billmckibben: Boardwalk f...,16666806,sandyA_real_10,UPI,Mon Oct 29 21:00:36 +0000 2012,0
7744,263351427320131584,Ground Zero #Sandy http://t.co/KA1jNv3I,15933769,sandyA_real_04,AngelaYvonne,Tue Oct 30 18:47:27 +0000 2012,0
7745,263163432268861440,scary: “@jfporchez: Ground zero http://t.co/wn...,68701170,sandyA_real_04,rosknopov,Tue Oct 30 06:20:26 +0000 2012,0
...,...,...,...,...,...,...,...
12859,325008771422244864,#prayforboston #suspect #suspects #bostonsuspe...,1059330391,boston_real_29,LisaAnneKiraly,Thu Apr 18 22:11:44 +0000 2013,0
12860,325008198329307137,#prayforboston #suspect #suspects #bostonsuspe...,1059330391,boston_real_24,LisaAnneKiraly,Thu Apr 18 22:09:27 +0000 2013,0
12861,325009065166131200,#prayforboston #suspect #suspects #bostonsuspe...,1059330391,boston_real_28,LisaAnneKiraly,Thu Apr 18 22:12:54 +0000 2013,0
12862,325008834554896384,#prayforboston #suspect #suspects #bostonsuspe...,1059330391,boston_real_02,LisaAnneKiraly,Thu Apr 18 22:11:59 +0000 2013,0


In [265]:
#Data vis
#https://medium.com/plotly/nlp-visualisations-for-clear-immediate-insights-into-text-data-and-outputs-9ebfab168d5b
fig = px.histogram(data, x='label', template='plotly_white', title='Tweet counts by label')
fig.update_xaxes(categoryorder='category descending', title='Label').update_yaxes(title='Number of tweets')
fig.show()

In [266]:
fig = px.histogram(data, x='username', template='plotly_white', title='Tweet counts by username',
                  nbins=6)
fig.update_xaxes(categoryorder='category descending', title='Username').update_yaxes(title='Number of tweets')
fig.show()

In [267]:
fig = px.histogram(data, x='imageId', template='plotly_white', title='Tweet counts by imageId')
fig.update_xaxes(categoryorder='category descending', title='ImageId').update_yaxes(title='Number of tweets')
fig.show()

In [268]:
# Some fiddling with TextBlob: try language detection and translation on a single tweet.
# NOTE(review): the recorded run of this cell ended with
# "HTTPError: HTTP Error 429: Too Many Requests", so the cell does not survive
# Restart & Run All as is. Presumably detect_language()/translate() call a
# rate-limited web service -- TODO confirm and guard or remove.
from textblob import TextBlob

# Show the raw text of the row labelled 0.
print(data.at[0, 'tweetText'])

# Wrap the same tweet in a TextBlob and print the language code it detects.
blob = TextBlob(data.at[0, 'tweetText'])
print(blob.detect_language())
#blob = TextBlob("I am trying to translate an English sentence into English.")
# Last expression of the cell: the English translation is what would be displayed.
blob.translate(to='en')
#Source: https://medium.com/@patilpratik699/introduction-to-textblob-a-tool-for-natural-language-processing-7f0a225ed0ee

¿Se acuerdan de la película: “El día después de mañana”? Me recuerda a lo que está pasando con el huracán #Sandy. http://t.co/JQQeRPwN


HTTPError: HTTP Error 429: Too Many Requests

In [269]:
#Source: https://towardsdatascience.com/a-complete-exploratory-data-analysis-and-visualization-for-text-data-29fb1b96fb6a
#Preprocessing 

#According to https://towardsdatascience.com/all-you-need-to-know-about-text-preprocessing-for-nlp-and-machine-learning-bc1c5765ff67
#Must do:
# Noise removal
# Lowercasing

#Should do:
# Simple normalization

#Task dependent:
#1. Advanced normalization
#2. Stop word removal
#3. Stemming/lemmatization
#4. Text enrichment

#Lower Casing
data['tweetText'] = data['tweetText'].str.lower()

#Removal of link at the end: keep only the text that precedes the first "http"
import re
data['tweetText'] = data['tweetText'].apply(lambda text: re.split('http.*', str(text))[0])

#Noise removal: drop every character that is neither a word character nor whitespace.
# regex=True is passed explicitly because pandas >= 2.0 treats the pattern as a literal
# string by default, which would silently leave all punctuation in place. The raw string
# avoids Python's invalid-escape warning for "\w" and "\s".
data['tweetText'] = data['tweetText'].str.replace(r'[^\w\s]', '', regex=True)

data[['tweetText', 'label']].head()


#data['tweetText'] = data['tweetText'].map(lambda text: TextBlob(text).translate(to='en') if TextBlob(text).detect_language() != 'en' else text)

Unnamed: 0,tweetText,label
0,se acuerdan de la película el día después de m...,1
1,milenagimon miren a sandy en ny tremenda imag...,1
2,buena la foto del huracán sandy me recuerda a ...,1
3,scary shit hurricane ny,1
4,my fave place in the world nyc hurricane sandy...,1


In [270]:
#Correct spelling
#data['tweetText'][:50].apply(lambda x: str(TextBlob(x).correct()))

#data[['tweetText', 'label']].head()

#Quite expensive

In [271]:
#Finding how many tweets have identical text and could affect the classifier
data['tweetText'].duplicated().sum()
#Returns 1302 in the recorded run (12376 rows before this second de-duplication, 11074 after)

#Removing the duplicate rows
data.drop_duplicates(subset=['tweetText'], keep='first', inplace=True, ignore_index=False)

In [272]:
#Describe the tweetText column
data['tweetText'].describe()

count                                                 11074
unique                                                11074
top       wow new york taxis after sandy kelseyjharpers ...
freq                                                      1
Name: tweetText, dtype: object

In [273]:
#Plot the length of tweets
data['length'] = data['tweetText'].str.len()
fig = px.histogram(data, x='length', template='plotly_white', title='Length of tweets')
fig.update_xaxes(categoryorder='category descending', title='length').update_yaxes(title='Tweets')
fig.show()

In [274]:
# Keeping only tweets longer than 4 characters (i.e. removing those with 4 or fewer)
data = data[data['length'] > 4]

In [275]:
# Emoji removal
# Function to remove emoji from github
# Compiled once at definition time instead of on every call.
# The widely copied version of this pattern contains the range U+24C2-U+1F251, which
# spans most of the Basic Multilingual Plane (CJK, Kana, Hangul, ...) and therefore
# deleted ordinary non-Latin letters. It is replaced by the narrow blocks listed below.
_EMOJI_PATTERN = re.compile("["
                            "\U0001F600-\U0001F64F"  # emoticons
                            "\U0001F300-\U0001F5FF"  # symbols & pictographs
                            "\U0001F680-\U0001F6FF"  # transport & map symbols
                            "\U0001F1E0-\U0001F1FF"  # flags (iOS)
                            "\U00002702-\U000027B0"  # dingbats
                            "\U00002600-\U000026FF"  # miscellaneous symbols
                            "\U000024C2"             # circled M
                            "\U0001F200-\U0001F251"  # enclosed ideographic supplement
                            "\U0000FE0F"             # variation selector-16
                            "]+", flags=re.UNICODE)


def emoji(string):
    """Return ``string`` with emoji and pictographic symbols removed.

    Parameters
    ----------
    string : str
        Text to clean.

    Returns
    -------
    str
        ``string`` with every run of characters matched by ``_EMOJI_PATTERN``
        deleted; letters of any script are left untouched.
    """
    return _EMOJI_PATTERN.sub('', string)

#Applying the emoji-removal function to the 'tweetText' column
data['tweetText'] = data['tweetText'].apply(emoji)
data[['tweetText', 'label']].head(15)

Unnamed: 0,tweetText,label
0,se acuerdan de la película el día después de m...,1
1,milenagimon miren a sandy en ny tremenda imag...,1
2,buena la foto del huracán sandy me recuerda a ...,1
3,scary shit hurricane ny,1
4,my fave place in the world nyc hurricane sandy...,1
5,42nd time square nyc subway hurricane,1
6,just in time for halloween a photo of hurrican...,1
7,crazy pic of hurricane sandy prayers go out to...,1
8,sandy newyork hurricane statueofliberty usa,1
9,nyc hurricane,1


In [276]:
# Stop word removal
from nltk.corpus import stopwords
STOPWORDS = set(stopwords.words('english'))

def remove_stopwords(text):
    """Return ``text`` without the whitespace-separated tokens found in ``STOPWORDS``.

    ``text`` is converted with ``str()`` first; the surviving tokens are joined
    with single spaces.
    """
    kept_tokens = (token for token in str(text).split() if token not in STOPWORDS)
    return " ".join(kept_tokens)

data['tweetText'] = data['tweetText'].apply(remove_stopwords)
data.head(5)

Unnamed: 0,tweetId,tweetText,userId,imageId,username,timestamp,label,length
0,263046056240115712,se acuerdan de la película el día después de m...,21226711,sandyA_fake_46,iAnnieM,Mon Oct 29 22:34:01 +0000 2012,1,107
1,262995061304852481,milenagimon miren sandy en ny tremenda imagen ...,192378571,sandyA_fake_09,CarlosVerareal,Mon Oct 29 19:11:23 +0000 2012,1,97
2,262979898002534400,buena la foto del huracán sandy recuerda la pe...,132303095,sandyA_fake_09,LucasPalape,Mon Oct 29 18:11:08 +0000 2012,1,93
3,262996108400271360,scary shit hurricane ny,241995902,sandyA_fake_29,Haaaaarryyy,Mon Oct 29 19:15:33 +0000 2012,1,24
4,263018881839411200,fave place world nyc hurricane sandy statueofl...,250315890,sandyA_fake_15,princess__natt,Mon Oct 29 20:46:02 +0000 2012,1,64


In [277]:
#Detecting language
from langdetect import DetectorFactory, detect
from langdetect.lang_detect_exception import LangDetectException

# langdetect is non-deterministic by default; fixing the seed before the first
# detection makes reruns assign the same language to the same tweet.
DetectorFactory.seed = 0


def _detect_language_safe(text):
    """Return the language code detected for ``text``, or 'unknown' if detection fails.

    ``detect`` raises ``LangDetectException`` when the text has no usable features
    (e.g. it is empty or digits only), which would otherwise abort the whole ``apply``.
    """
    try:
        return detect(text)
    except LangDetectException:
        return 'unknown'


data['language'] = data['tweetText'].apply(_detect_language_safe)
data.head(5)

Unnamed: 0,tweetId,tweetText,userId,imageId,username,timestamp,label,length,language
0,263046056240115712,se acuerdan de la película el día después de m...,21226711,sandyA_fake_46,iAnnieM,Mon Oct 29 22:34:01 +0000 2012,1,107,es
1,262995061304852481,milenagimon miren sandy en ny tremenda imagen ...,192378571,sandyA_fake_09,CarlosVerareal,Mon Oct 29 19:11:23 +0000 2012,1,97,es
2,262979898002534400,buena la foto del huracán sandy recuerda la pe...,132303095,sandyA_fake_09,LucasPalape,Mon Oct 29 18:11:08 +0000 2012,1,93,es
3,262996108400271360,scary shit hurricane ny,241995902,sandyA_fake_29,Haaaaarryyy,Mon Oct 29 19:15:33 +0000 2012,1,24,en
4,263018881839411200,fave place world nyc hurricane sandy statueofl...,250315890,sandyA_fake_15,princess__natt,Mon Oct 29 20:46:02 +0000 2012,1,64,en


In [278]:
#Display how many languages are in dataset
fig = px.histogram(data, x='language', template='plotly_white', title='Tweet counts by language',
                  nbins=6)
fig.update_xaxes(categoryorder='category descending', title='Language').update_yaxes(title='Number of tweets')
fig.show()

In [279]:
#Translation
def mytranslate(text):
    """Translate ``text`` to English unless TextBlob already detects it as English.

    Parameters
    ----------
    text : str
        Text to translate.

    Returns
    -------
    str
        The English translation, or ``text`` unchanged when the detected language
        is 'en'. The translation is converted to a plain ``str`` so that both
        branches return the same type.
    """
    blob = TextBlob(text)
    # detect_language is a method: without the call parentheses the bound method itself
    # was compared to 'en', which is never equal, so every text was sent for translation.
    if blob.detect_language() != 'en':
        return str(blob.translate(to='en'))
    return text
    
#Other language removal since TextBlob is limited only to 10000 words a day
data = data[data['language'] == 'en']

data[['tweetText', 'userId']].head(5)

Unnamed: 0,tweetText,userId
3,scary shit hurricane ny,241995902
4,fave place world nyc hurricane sandy statueofl...,250315890
5,42nd time square nyc subway hurricane,163674788
6,time halloween photo hurricane sandy frankenstorm,246153081
7,crazy pic hurricane sandy prayers go family fr...,199565482


In [280]:
#Common word removal
from collections import Counter
# Corpus-wide word frequencies: one count per whitespace-separated token, tallied in
# the order the tokens appear in the 'tweetText' column.
cnt = Counter(
    token
    for tweet in data['tweetText'].values
    for token in tweet.split()
)

cnt.most_common()[-10:]

[('southern', 1),
 ('alabama', 1),
 ('tastes', 1),
 ('snouted', 1),
 ('fauxto', 1),
 ('bobombdom', 1),
 ('slaps', 1),
 ('tweetdeck', 1),
 ('brazil', 1),
 ('meatpigfish', 1)]

In [281]:
#Removal of frequent words
freq = set([w for (w, wc) in cnt.most_common(20)])

def removefreq(text, words=None):
    """Remove unwanted words from ``text``.

    Parameters
    ----------
    text : any
        Value to filter; it is converted with ``str()`` and split on whitespace.
    words : collection of str, optional
        Words to drop. When omitted, the module-level ``freq`` set is looked up at
        call time (the original behaviour). Passing the set explicitly avoids
        depending on whichever value ``freq`` currently holds.

    Returns
    -------
    str
        The remaining words joined by single spaces.
    """
    if words is None:
        words = freq
    return " ".join(word for word in str(text).split() if word not in words)


data['cleanText'] = data['tweetText'].apply(removefreq)

In [282]:
#Removal of rare words
freq = set([w for (w, wc) in cnt.most_common()[-20:]])

data['cleanText'] = data['cleanText'].apply(removefreq)

data[['cleanText', 'userId']].head(10)

Unnamed: 0,cleanText,userId
3,scary shit,241995902
4,fave place world statueofliberty,250315890
5,42nd time square subway,163674788
6,time halloween frankenstorm,246153081
7,crazy pic prayers go family friends east coast,199565482
8,statueofliberty usa,78475739
9,,869777653
10,robertosalibaba god u brother,359592461
11,crazy,31305940
12,newjersey swim,51599800


In [283]:
data.replace('', float("NaN"), inplace=True)
data.dropna(subset = ['cleanText'], inplace=True)
data.head(15)

Unnamed: 0,tweetId,tweetText,userId,imageId,username,timestamp,label,length,language,cleanText
3,262996108400271360,scary shit hurricane ny,241995902,sandyA_fake_29,Haaaaarryyy,Mon Oct 29 19:15:33 +0000 2012,1,24,en,scary shit
4,263018881839411200,fave place world nyc hurricane sandy statueofl...,250315890,sandyA_fake_15,princess__natt,Mon Oct 29 20:46:02 +0000 2012,1,64,en,fave place world statueofliberty
5,263364439582060545,42nd time square nyc subway hurricane,163674788,sandyA_fake_23,classycg,Tue Oct 30 19:39:10 +0000 2012,1,38,en,42nd time square subway
6,262927032705490944,time halloween photo hurricane sandy frankenstorm,246153081,sandyA_fake_14,j_unit87,Mon Oct 29 14:41:04 +0000 2012,1,67,en,time halloween frankenstorm
7,263321078884077568,crazy pic hurricane sandy prayers go family fr...,199565482,sandyA_fake_29,MrBlakMagik,Tue Oct 30 16:46:52 +0000 2012,1,84,en,crazy pic prayers go family friends east coast
8,263111677485142017,sandy newyork hurricane statueofliberty usa,78475739,sandyA_fake_15,safi37,Tue Oct 30 02:54:46 +0000 2012,1,44,en,statueofliberty usa
10,262989009930833920,robertosalibaba god u brother sandy hurricane ...,359592461,sandyA_fake_08,Michael_Saliba,Mon Oct 29 18:47:20 +0000 2012,1,63,en,robertosalibaba god u brother
11,263129115207536640,crazy hurricane sandy,31305940,sandyA_fake_29,SLAZARO31,Tue Oct 30 04:04:04 +0000 2012,1,22,en,crazy
12,263091320871063552,shark newjersey swim sandy hurricane,51599800,sandyA_fake_11,anaceciggt,Tue Oct 30 01:33:53 +0000 2012,1,38,en,newjersey swim
13,262990978611286016,good luck ny newyork usa hurricane sandy,125724906,sandyA_fake_29,gsevigny,Mon Oct 29 18:55:10 +0000 2012,1,41,en,good luck usa


In [284]:
#Tokenization
def tokenize(text):
    """Split ``text`` into word tokens.

    The text is split on runs of non-word characters. Empty strings that
    ``re.split`` produces for leading or trailing separators, or for an empty
    input, are discarded so the result contains only real tokens.

    Parameters
    ----------
    text : str
        Text to tokenize.

    Returns
    -------
    list of str
        The tokens in order of appearance; ``[]`` when there are none.
    """
    # Raw string: '\W' in a normal string literal is an invalid escape sequence.
    return [token for token in re.split(r'\W+', text) if token]
data['tokenized'] = data['cleanText'].apply(lambda x: tokenize(x.lower()))

data[['userId','tokenized']].head(10)

Unnamed: 0,userId,tokenized
3,241995902,"[scary, shit]"
4,250315890,"[fave, place, world, statueofliberty]"
5,163674788,"[42nd, time, square, subway]"
6,246153081,"[time, halloween, frankenstorm]"
7,199565482,"[crazy, pic, prayers, go, family, friends, eas..."
8,78475739,"[statueofliberty, usa]"
10,359592461,"[robertosalibaba, god, u, brother]"
11,31305940,[crazy]
12,51599800,"[newjersey, swim]"
13,125724906,"[good, luck, usa]"


In [285]:
#Stemming and lemmatization
import nltk
from nltk.corpus import wordnet
from nltk.stem import WordNetLemmatizer


lemmatizer = WordNetLemmatizer()
wordnet_map = {"N":wordnet.NOUN, "V":wordnet.VERB, "J":wordnet.ADJ, "R":wordnet.ADV}

def lemmatize_words(text):
    """Lemmatize each whitespace-separated word of ``text`` according to its POS tag.

    The first letter of every tag returned by ``nltk.pos_tag`` is looked up in
    ``wordnet_map``; tags without an entry are treated as nouns. The lemmas are
    joined back together with single spaces.
    """
    lemmas = []
    for token, tag in nltk.pos_tag(text.split()):
        wordnet_pos = wordnet_map.get(tag[0], wordnet.NOUN)
        lemmas.append(lemmatizer.lemmatize(token, wordnet_pos))
    return " ".join(lemmas)

data['text_lemma'] = data['cleanText'].apply(lemmatize_words)

data[['userId', 'text_lemma']]

Unnamed: 0,userId,text_lemma
3,241995902,scary shit
4,250315890,fave place world statueofliberty
5,163674788,42nd time square subway
6,246153081,time halloween frankenstorm
7,199565482,crazy pic prayer go family friend east coast
...,...,...
14267,400850456,pigfish like bacon
14268,239253556,loses hope society snopes pigfish animal another
14272,2179310905,pigfish
14273,254843101,specie fish find really good photoshop think


AttributeError: toArray not found

In [287]:
data['text_lemma']

3                                              scary shit
4                        fave place world statueofliberty
5                                 42nd time square subway
6                             time halloween frankenstorm
7            crazy pic prayer go family friend east coast
                               ...                       
14267                                  pigfish like bacon
14268    loses hope society snopes pigfish animal another
14272                                             pigfish
14273        specie fish find really good photoshop think
14276                                    cant decide fish
Name: text_lemma, Length: 7576, dtype: object

In [300]:
from sklearn.preprocessing import OrdinalEncoder

ord_enc = OrdinalEncoder()
data['imageIdCode'] = ord_enc.fit_transform(data[['imageId']])
data[['cleanText', 'imageIdCode']]

Unnamed: 0,cleanText,imageIdCode
3,scary shit,147.0
4,fave place world statueofliberty,133.0
5,42nd time square subway,141.0
6,time halloween frankenstorm,132.0
7,crazy pic prayers go family friends east coast,147.0
...,...,...
14267,pigfish like bacon,118.0
14268,loses hope society snopes pigfish animal another,118.0
14272,pigfish,118.0
14273,species fish found really good photoshop think,118.0


In [341]:
#https://medium.com/vickdata/detecting-hate-speech-in-tweets-natural-language-processing-in-python-for-beginners-4e591952223
#First try with SGDClassifier
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.linear_model import SGDClassifier, Perceptron, LogisticRegression

# Bag-of-words counts -> TF-IDF weighting -> linear classifier trained with SGD.
# The last step keeps its original name 'nb' (although it is not a naive Bayes model)
# so that parameter keys such as 'nb__alpha' stay valid.
# random_state is fixed because SGDClassifier shuffles the training data between epochs;
# without a seed the fitted model and its scores change from run to run.
pipeline_sgd = Pipeline([
    ('vect', CountVectorizer()),
    ('tfidf',  TfidfTransformer()),
    ('nb', SGDClassifier(random_state=0)),
])

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(data['cleanText'],
                                                   data['label'], random_state=0)

In [342]:
model = pipeline_sgd.fit(X_train, y_train)
y_predict = model.predict(X_test)

from sklearn.metrics import f1_score
f1_score(y_test, y_predict)

0.9266263237518911

In [343]:
from sklearn.metrics import confusion_matrix
confusion_matrix(y_test, y_predict)

array([[ 475,  130],
       [  64, 1225]])

In [353]:
#Hyperparameter tuning
#pipeline_sgd.get_params()

In [345]:
# Evaluate the fitted pipeline on the separate test-set file.
testData = pd.read_csv("mediaeval-2015-testset.txt", sep="\t", lineterminator='\n', skiprows=(0),  header=(0))
#Mapping the labels to numerical data to convert from categorical data
# (any label other than 'fake', 'humor' or 'real' becomes NaN)
testData['label'] = testData['label'].map({'fake': 1, 'humor': 1, 'real': 0})


# NOTE(review): `model` was fitted on data['cleanText'] (lower-cased, links, punctuation,
# stop words and frequent/rare words removed, English tweets only), but here it predicts
# on the raw 'tweetText' column of the test set, so training and test inputs are not
# preprocessed the same way. This may explain the lower score on the test set --
# TODO apply the same cleaning steps to testData before predicting.
y_predict = model.predict(testData['tweetText'])

# F1 score with label 1 (fake/humor) as the positive class.
f1_score(testData['label'], y_predict)

0.7647867950481431

In [346]:
from sklearn.metrics import confusion_matrix
confusion_matrix(testData['label'], y_predict)

array([[ 163, 1046],
       [ 322, 2224]])

In [360]:
#https://towardsdatascience.com/build-and-compare-3-models-nlp-sentiment-prediction-67320979de61
from sklearn.naive_bayes import GaussianNB

cv = CountVectorizer(max_features = 1500)
X = cv.fit_transform(data['cleanText']).toarray()
y = data['label']

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

classifier = GaussianNB()
classifier.fit(X_train, y_train)

y_pred_NB = classifier.predict(X_test)
y_pred_NB

array([0, 1, 0, ..., 1, 1, 1])

In [361]:
f1_score(y_test, y_pred_NB)

0.7117611212675198

In [367]:
cm_NB = confusion_matrix(y_test, y_pred_NB) 
cm_NB


array([[459,  25],
       [448, 584]])

In [368]:
from sklearn.tree import DecisionTreeClassifier

classifier = DecisionTreeClassifier(criterion = 'entropy', random_state = 0)
classifier.fit(X_train, y_train)
y_pred_DT = classifier.predict(X_test)
f1_score(y_test, y_pred_DT)

0.8960155490767736

In [369]:
cm_DT = confusion_matrix(y_test, y_pred_DT) 
cm_DT

array([[380, 104],
       [110, 922]])