In [1]:
#import libraries
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import re
import emoji

#### DATA UNDERSTANDING
---

In [2]:
#read the tab-separated source tweets; header=None means the first line is data
source_tweet = pd.read_csv(
    'twitter15//source_tweets.txt',
    sep="\t",
    header=None,
    names=["ID", "text"],
)

#read the colon-separated labels; header=None means the first line is data
label = pd.read_csv(
    'twitter15//label.txt',
    sep=":",
    header=None,
    names=["label", "ID"],
)

In [3]:
#left-join the labels onto the tweets through their shared ID column
text_data = source_tweet.merge(label, how="left", on="ID")

In [4]:
#display the merged frame (columns: ID, text, label)
text_data

Unnamed: 0,ID,text,label
0,731166399389962242,🔥ca kkk grand wizard 🔥 endorses @hillaryclinto...,unverified
1,714598641827246081,an open letter to trump voters from his top st...,unverified
2,691809004356501505,america is a nation of second chances —@potus ...,non-rumor
3,693204708933160960,"brandon marshall visits and offers advice, sup...",non-rumor
4,551099691702956032,rip elly may clampett: so sad to learn #beverl...,true
...,...,...,...
1485,692004901455556608,.@potus just announced new reforms to address ...,non-rumor
1486,760109079133990912,“after school satan clubs”? URL,unverified
1487,500281131057811456,breaking news: according to documents released...,unverified
1488,523098334421319680,ebola vaccines? URL #news #today,false


In [5]:
#count the missing values in every column
text_data.isna().sum()

ID       0
text     0
label    0
dtype: int64

In [6]:
#collect the distinct label values as a plain Python list
label_list = text_data['label'].unique().tolist()
label_list

['unverified', 'non-rumor', 'true', 'false']

#### DATA PREPROCESSING
---

In [7]:
#data cleaning process
def clean_text(text_list):
    """Clean raw tweet strings before feature extraction.

    Each text goes through these steps, in order:
      1. decode the common HTML entities ('&amp;' -> 'and', '&lt;' -> '<',
         '&gt;' -> '>');
      2. delete any remaining HTML entities, tickers, http links, retweet
         markers and @mentions;
      3. collapse every run of two or more whitespace characters into a
         single space (leading/trailing single spaces are kept);
      4. pass the result through ``emoji.demojize``.

    Parameters
    ----------
    text_list : iterable of str
        Raw texts, e.g. a list or a pandas Series of tweets.

    Returns
    -------
    list of str
        The cleaned texts, in the same order as the input.
    """
    #decoded BEFORE the generic entity removal below: if '&\w*;' ran first it
    #would delete '&amp;', '&lt;' and '&gt;' and these replacements could
    #never fire
    html_entities = {
        r'&amp;?': 'and',
        r'&lt;': '<',
        r'&gt;': '>'
    }

    patterns = [
        r'&\w*;',                      #any other html special entity
        r'\$\w*',                      #tickers
        #links: stop at the first whitespace so that ordinary text after the
        #link (even text containing '/') is never swallowed
        r'http\S+',
        r'http \.\.\.',                #truncated link written as "http ..."
        #retweet marker + handle; \b keeps words ending in "rt" (e.g. "smart")
        #from being treated as a retweet marker
        r'\b(?:RT|rt)[ ]*@[ ]*\S+',
        r'\bRT[ ]?@',                  #retweet marker with no handle after it
        r'@\S+'                        #mentions
    ]

    #two or more whitespace characters (this also covers runs of plain spaces)
    excess_whitespace = r'\s{2,}'

    cleaned_texts = []
    for text in text_list:

        #replace html entities
        for entity, replacement in html_entities.items():
            text = re.sub(entity, replacement, text)

        #apply removal patterns
        for pattern in patterns:
            text = re.sub(pattern, '', text)

        #remove any redundant whitespace
        text = re.sub(excess_whitespace, ' ', text)

        #convert emoji to text
        text = emoji.demojize(text)

        cleaned_texts.append(text)

    return cleaned_texts

        

In [8]:
#clean every tweet and store the result back in the 'text' column
text_data = text_data.assign(text=clean_text(text_data['text']))
text_data

Unnamed: 0,ID,text,label
0,731166399389962242,:fire:ca kkk grand wizard :fire: endorses #nev...,unverified
1,714598641827246081,an open letter to trump voters from his top st...,unverified
2,691809004356501505,america is a nation of second chances — on new...,non-rumor
3,693204708933160960,"brandon marshall visits and offers advice, sup...",non-rumor
4,551099691702956032,rip elly may clampett: so sad to learn #beverl...,true
...,...,...,...
1485,692004901455556608,. just announced new reforms to address the ov...,non-rumor
1486,760109079133990912,“after school satan clubs”? URL,unverified
1487,500281131057811456,breaking news: according to documents released...,unverified
1488,523098334421319680,ebola vaccines? URL #news #today,false


In [9]:
#the four classes have nearly equal counts, so there is no class imbalance to correct for
text_data.loc[:, 'label'].value_counts()

label
unverified    374
non-rumor     374
true          372
false         370
Name: count, dtype: int64

#### FEATURE EXTRACTION + MODEL BUILDING
---

In [10]:
#import libraries 
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import LabelEncoder
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, cross_validate
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score
from sklearn.svm import SVC

In [11]:
#features: the cleaned tweet text
X = text_data['text']
#target: select the label column by name instead of by position, so the right
#column is used even if more columns are appended to text_data later;
#the double brackets keep y a one-column DataFrame, as before
y = text_data[['label']]

In [12]:
#hold out 20% of the rows for testing; the fixed random_state keeps the split repeatable
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    random_state=42,
)

In [13]:
#encode the string labels as integers
label_encoder = LabelEncoder()

#learn the label -> integer mapping from the training labels only, then reuse
#that same mapping for the test labels; refitting on y_test could assign
#different integers if the test split did not contain every class
#(.values.ravel() flattens the one-column frame into a 1d array)
y_train_encoded = label_encoder.fit_transform(y_train.values.ravel())
y_test_encoded = label_encoder.transform(y_test.values.ravel())

print(label_encoder.classes_)

['false' 'non-rumor' 'true' 'unverified']


##### SVM using a Linear Kernel
---

In [14]:
#pipeline: TF-IDF features feeding a linear-kernel SVM
SVM_clf = Pipeline(steps=[
    ("tfidf", TfidfVectorizer(stop_words='english', lowercase=True)),
    ("clf", SVC(kernel='linear')),
])

#candidate values for the SVM's C parameter, on a logarithmic scale
SVMParameters = {
    "clf__C": [0.001, 0.01, 0.1, 1, 10, 100],
}

#5-fold cross-validated grid search, scored by accuracy
grid_search = GridSearchCV(
    estimator=SVM_clf,
    param_grid=SVMParameters,
    cv=5,
    scoring="accuracy",
)
grid_search.fit(X_train, y_train_encoded)

#report the selected C and its cross-validation score
print("Best Parameters:", grid_search.best_params_["clf__C"])
print("Best accuracy:", grid_search.best_score_)

Best Parameters: 10
Best accuracy: 0.8212756232199994


In [15]:
#evaluate the best pipeline found by the linear-kernel grid search on the test split
best_model = grid_search.best_estimator_
y_pred = best_model.predict(X_test)
print(
    "Test accuracy:",
    accuracy_score(y_true=y_test_encoded, y_pred=y_pred),
)

Test accuracy: 0.8288590604026845


##### SVM using a Non-Linear Kernel
---

In [16]:
#pipeline: TF-IDF features feeding an RBF-kernel SVM
SVM_clf2 = Pipeline(steps=[
    ("tfidf", TfidfVectorizer(stop_words='english', lowercase=True)),
    ("clf", SVC(kernel='rbf')),
])

#candidate values for C and gamma, both on a logarithmic scale
SVMParameters2 = {
    "clf__C": [0.001, 0.01, 0.1, 1, 10, 100],
    "clf__gamma": [0.0001, 0.001, 0.01, 0.1, 1, 10, 100],
}

#5-fold cross-validated grid search over every (C, gamma) pair, scored by accuracy
grid_search2 = GridSearchCV(
    estimator=SVM_clf2,
    param_grid=SVMParameters2,
    cv=5,
    scoring="accuracy",
)
grid_search2.fit(X_train, y_train_encoded)

#report the selected parameters and their cross-validation score
print("Best Parameters:", grid_search2.best_params_)
print("Best accuracy:", grid_search2.best_score_)

Best Parameters: {'clf__C': 10, 'clf__gamma': 1}
Best accuracy: 0.8237895995218171


In [17]:
#evaluate the best pipeline found by the RBF-kernel grid search on the test split
best_model2 = grid_search2.best_estimator_
y_pred2 = best_model2.predict(X_test)
print(
    "Test accuracy:",
    accuracy_score(y_true=y_test_encoded, y_pred=y_pred2),
)

Test accuracy: 0.8288590604026845
