In [1]:
import json
import glob
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
import string
from nltk.stem import WordNetLemmatizer
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import random
import csv
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import f1_score
# Seed Python's and NumPy's global random generators so draws from them repeat across runs.
random.seed(42)
np.random.seed(42)

In [2]:
# Translation table built once: every ASCII punctuation mark, the curly double
# quotes and the em dash all map to a single space.
_PUNCTUATION_TO_SPACE = str.maketrans({ch: ' ' for ch in string.punctuation + '“”—'})


def remove_punctuations(x):
    """Return ``x`` with every punctuation character replaced by a space.

    The characters replaced are ``string.punctuation`` plus the curly double
    quotes and the em dash. Replacement is one-for-one, so the result has the
    same length as the input and runs of punctuation become runs of spaces.

    Parameters
    ----------
    x : str
        Text to clean.

    Returns
    -------
    str
        The cleaned text.
    """
    return x.translate(_PUNCTUATION_TO_SPACE)

def preprocess(X):
    """Normalise a pandas Series of text and return the cleaned Series.

    Steps, applied to every element in this order:

    1. lower-case the text;
    2. drop English stop words (tokens are split on whitespace);
    3. replace punctuation with spaces via ``remove_punctuations``;
    4. lemmatise each remaining token with ``WordNetLemmatizer`` and drop stop
       words a second time, because removing punctuation can expose tokens
       that were previously glued to a punctuation mark.

    Parameters
    ----------
    X : pandas.Series of str

    Returns
    -------
    pandas.Series
        A new Series; the caller's Series is not modified.
    """
    # A set gives constant-time membership tests; the stop-word list is probed
    # once per token, twice per document.
    stop = set(stopwords.words('english'))
    lemmatizer = WordNetLemmatizer()

    X = X.str.lower()
    X = X.apply(lambda text: ' '.join(word for word in text.split() if word not in stop))
    X = X.apply(remove_punctuations)
    # The stop-word test is applied to the token as written, before lemmatising.
    X = X.apply(
        lambda text: ' '.join(
            lemmatizer.lemmatize(word) for word in text.split() if word not in stop
        )
    )
    return X

def get_tf_idf_vector(X):
    """Fit a default ``TfidfVectorizer`` on ``X``.

    Returns a ``(fitted_vectorizer, tf_idf_matrix)`` pair, where the matrix is
    the result of ``fit_transform`` on the same ``X``.
    """
    tfidf = TfidfVectorizer()
    return tfidf, tfidf.fit_transform(X)

def get_tf_idf_vector_test(X,vectorizer):
    """Project ``X`` with an already-fitted vectorizer.

    Only ``vectorizer.transform`` is called, so the vectorizer is not refitted.
    """
    transformed = vectorizer.transform(X)
    return transformed

def truncated_svd_on_tf_idf_vector(X, n_components=1000, n_iter=7, random_state=42):
    """Fit a ``TruncatedSVD`` on ``X`` and return ``(fitted_svd, reduced_X)``.

    Parameters
    ----------
    X
        Matrix to reduce (the training TF-IDF matrix in this notebook).
    n_components, n_iter, random_state
        Forwarded unchanged to ``TruncatedSVD``. The defaults reproduce the
        configuration that used to be hard-coded here.

    Returns
    -------
    tuple
        The fitted ``TruncatedSVD`` instance and the output of its
        ``fit_transform`` on ``X``.
    """
    truncatedSVD = TruncatedSVD(
        n_components=n_components,
        n_iter=n_iter,
        random_state=random_state,
    )
    X = truncatedSVD.fit_transform(X)
    return truncatedSVD, X

def truncated_svd_on_tf_idf_vector_test(X, truncatedSVD):
    """Project held-out data ``X`` with an already-fitted ``truncatedSVD``.

    Uses ``transform`` rather than ``fit_transform``: refitting on validation
    or test data would learn a different projection from the one the
    classifier was trained on, and would leak held-out data into the model.
    """
    return truncatedSVD.transform(X)
    

## Using processed data

In [45]:
# Load the pickled article dictionary from the processed-data directory.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# run this against a file from a trusted source.
with open('./data/processed/dictionary.pickle','rb') as f:
    dictionary = pickle.load(f)

In [46]:
# Turn the loaded dictionary into a DataFrame; later cells join on its 'cui' column.
article_df = pd.DataFrame.from_dict(dictionary)

In [47]:
def return_list_of_dict(filename, data_dir='./data/processed/'):
    """Read a JSON Lines file and return its records as a list.

    Parameters
    ----------
    filename : str
        Name of the ``.jsonl`` file inside ``data_dir``.
    data_dir : str, optional
        Directory containing the file. Defaults to the processed-data
        directory that was previously hard-coded.

    Returns
    -------
    list
        One decoded JSON value per non-blank line, in file order.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    import os  # local import: `os` is not among the notebook's top-level imports

    records = []
    # JSON text is UTF-8; do not depend on the platform's default encoding.
    with open(os.path.join(data_dir, filename), 'r', encoding='utf-8') as f:
        for line in f:
            # Blank lines (e.g. a trailing newline) would make json.loads raise.
            if line.strip():
                records.append(json.loads(line))
    return records

In [48]:
# Read the three JSON Lines splits into lists of records, in train / valid / test order.
train, val, test = (
    return_list_of_dict(split_file)
    for split_file in ('train.jsonl', 'valid.jsonl', 'test.jsonl')
)

In [49]:
# One DataFrame per split. The inputs are lists of records, which the plain
# DataFrame constructor accepts directly.
train_df, val_df, test_df = (pd.DataFrame(records) for records in (train, val, test))

In [50]:
# Preview the first rows of the training frame.
train_df.head()

Unnamed: 0,mention,mention_id,context_left,context_right,context_doc_id,type,label_id,label,label_title
0,The families of these four should sit in the f...,780136889441456128,,,,,272,,
1,user person didnt lower my my taxes and he rai...,913075551212011520,,,,,273,,
2,JaredBerry316 glennbeck like when white Christ...,753045148775440384,,,,,274,,
3,AdamKazda 60 diplomats were killed on Bushs wa...,743830161393520640,,,,,14,,
4,fight4women Wow you got a source for all that ...,818188352314900480,,,,,275,,


In [51]:
# Model input: the 'mention' text column of each split.
X_train = train_df['mention']
X_val = val_df['mention']
X_test = test_df['mention']

In [52]:
# Apply the same text normalisation to every split.
# NOTE: each name is overwritten with its cleaned version, so re-running this
# cell alone preprocesses already-preprocessed text.
X_train = preprocess(X_train)
X_test = preprocess(X_test)
X_val = preprocess(X_val)

In [53]:
# Classification target: the 'label_id' column of each split.
y_train = train_df['label_id']
y_val = val_df['label_id']
y_test = test_df['label_id']

In [54]:
# Fit TF-IDF on the training text only, then reuse the fitted vectorizer for the
# validation and test text.
# The TruncatedSVD reduction (intended as a PCA substitute, as PCA does not work
# for sparse matrices) is currently disabled; the commented-out lines show where
# it would plug in.
vectorizer, X_train_tf_idf_vector = get_tf_idf_vector(X_train)
#truncatedSVD, X_train_svd = truncated_svd_on_tf_idf_vector(X_train_tf_idf_vector)

X_val_tf_idf_vector = get_tf_idf_vector_test(X_val,vectorizer)
#X_val_svd = truncated_svd_on_tf_idf_vector_test(X_val_tf_idf_vector,truncatedSVD)

X_test_tf_idf_vector = get_tf_idf_vector_test(X_test,vectorizer)
#X_test_svd = truncated_svd_on_tf_idf_vector_test(X_test_tf_idf_vector, truncatedSVD)

In [57]:
# Training SVM: scale the TF-IDF features without centring (with_mean=False),
# then fit an SVC with gamma='auto' and balanced class weights.
clf = make_pipeline(StandardScaler(with_mean=False), SVC(gamma='auto',class_weight='balanced'))
clf.fit(X_train_tf_idf_vector, y_train)
# Disabled SVD variant (class_weight belongs to SVC, not to make_pipeline):
#clf = make_pipeline(StandardScaler(), SVC(gamma='auto', class_weight='balanced'))
#clf.fit(X_train_svd, y_train)

Pipeline(steps=[('standardscaler', StandardScaler(with_mean=False)),
                ('svc', SVC(class_weight='balanced', gamma='auto'))])

In [17]:
# Predict the test split with the fitted pipeline. No other cell defines
# y_test_pred, so without this line the notebook fails on a fresh kernel.
y_test_pred = clf.predict(X_test_tf_idf_vector)
# Test accuracy in percent.
sum(y_test_pred==y_test)/y_test.shape[0]*100

14.129181084198386

In [19]:
# Split the test labels into hits and misses. zip pairs the two sequences by
# position, so this does not rely on y_test carrying a 0..n-1 index the way
# the label-based lookup y_test[i] did.
correct=[]
incorrect=[]
for true_label, pred_label in zip(y_test, y_test_pred):
    if true_label == pred_label:
        correct.append(true_label)
    else:
        incorrect.append([true_label, pred_label])

In [22]:
# Wrap the list of correctly predicted labels in a DataFrame
# (the argument was misspelled `correc`, which raises NameError).
correct = pd.DataFrame(correct)

0.8587081891580162

In [38]:
# Start the results frame with the true test labels in a 'true' column.
y_test_pred_df=pd.DataFrame(np.array(y_test), columns=['true'])

In [39]:
# Add the predicted labels next to the true ones.
y_test_pred_df['pred']=np.array(y_test_pred)

In [41]:
# Save the true/pred table; later cells reload this file.
y_test_pred_df.to_csv('TFIDF_SVM_Predictions.csv')

In [7]:
# Reload the saved predictions; index_col=0 reads the first CSV column back as the row index.
y_test_pred_df = pd.read_csv('TFIDF_SVM_Predictions.csv',index_col=0)

In [10]:
# Display the true/pred table.
y_test_pred_df

Unnamed: 0,true,pred
0,186,155
1,47,155
2,272,155
3,272,155
4,892,155
...,...,...
5197,1201,155
5198,1170,155
5199,1095,155
5200,2381,155


In [20]:
# Number of test rows for which the model predicted label 155.
len(y_test_pred_df[y_test_pred_df['pred']==155])

4753

In [11]:
# Rows where the prediction equals the true label.
y_test_correct = y_test_pred_df[y_test_pred_df['true']==y_test_pred_df['pred']]
y_test_correct.head()

Unnamed: 0,true,pred
11,186,186
22,21,21
30,14,14
140,279,279
178,279,279


In [23]:
# Rows where the prediction differs from the true label.
y_test_incorrect = y_test_pred_df[y_test_pred_df['true']!=y_test_pred_df['pred']]

In [44]:
# Preview the misclassified rows.
y_test_incorrect.head()

Unnamed: 0,true,pred
0,186,155
1,47,155
2,272,155
3,272,155
4,892,155


In [24]:
# Number of correctly classified test rows
# (the empty subscript `y_test_correct[]` was a syntax error).
len(y_test_correct)

735

In [28]:
# Correct predictions whose predicted label is 155.
len(y_test_correct[y_test_correct['pred']==155])

359

In [25]:
# Number of misclassified test rows.
len(y_test_incorrect)

4467

In [26]:
# Misclassified rows where the (wrong) prediction was label 155.
len(y_test_incorrect[y_test_incorrect['pred']==155])

4394

In [27]:
# Number of test rows whose true label is 155.
len(y_test_pred_df[y_test_pred_df['true']==155])

360

In [47]:
# Reload the article dictionary and rebuild article_df; this repeats the load
# done earlier in the notebook.
# NOTE(review): pickle.load can execute arbitrary code while unpickling; only
# run this against a file from a trusted source.
with open('./data/processed/dictionary.pickle','rb') as f:
    dictionary = pickle.load(f)
article_df = pd.DataFrame.from_dict(dictionary)

In [50]:
# Attach the article row whose 'cui' equals the true label.
# NOTE: this overwrites y_test_correct, so re-running the cell merges the
# article columns a second time.
y_test_correct=y_test_correct.merge(article_df,left_on='true', right_on='cui')

In [51]:
# Display the correct predictions with their article columns.
y_test_correct

Unnamed: 0,true,pred,cui,title,description,summary
0,186,186,186,Hillary Clinton Kissed by Former Klan Member S...,,
1,186,186,186,Hillary Clinton Kissed by Former Klan Member S...,,
2,186,186,186,Hillary Clinton Kissed by Former Klan Member S...,,
3,186,186,186,Hillary Clinton Kissed by Former Klan Member S...,,
4,186,186,186,Hillary Clinton Kissed by Former Klan Member S...,,
...,...,...,...,...,...,...
730,1138,1138,1138,Was DNC Worker Seth Conrad Rich Gunned Down on...,,
731,192,192,192,Is Barack Obama039s Birth Certificate Fake Sno...,,
732,1120,1120,1120,Is Comet Ping Pong Pizzeria Home to a Child Ab...,,
733,1321,1321,1321,PolitiFact For the third time Donald Trump US ...,Donald Trump released a tax plan in September ...,Donald Trump said this week hes considering ra...


In [52]:
# Attach the article row for the true label ('true' matched against 'cui').
y_test_incorrect=y_test_incorrect.merge(article_df,left_on='true', right_on='cui')

In [53]:
# Attach the article row for the predicted label ('pred' matched against 'cui').
# The article columns now appear twice, so pandas suffixes them _x and _y.
y_test_incorrect=y_test_incorrect.merge(article_df,left_on='pred', right_on='cui')

In [56]:
# Preview the misclassified rows with both sets of article columns.
y_test_incorrect.head()

Unnamed: 0,true,pred,cui_x,title_x,description_x,summary_x,cui_y,title_y,description_y,summary_y
0,186,155,186,Hillary Clinton Kissed by Former Klan Member S...,,,155,FACT CHECK Hillary Clinton Gave 20 Percent of ...,,
1,186,155,186,Hillary Clinton Kissed by Former Klan Member S...,,,155,FACT CHECK Hillary Clinton Gave 20 Percent of ...,,
2,186,155,186,Hillary Clinton Kissed by Former Klan Member S...,,,155,FACT CHECK Hillary Clinton Gave 20 Percent of ...,,
3,186,155,186,Hillary Clinton Kissed by Former Klan Member S...,,,155,FACT CHECK Hillary Clinton Gave 20 Percent of ...,,
4,186,155,186,Hillary Clinton Kissed by Former Klan Member S...,,,155,FACT CHECK Hillary Clinton Gave 20 Percent of ...,,


In [63]:
# Replace the merge suffixes with explicit prefixes: *_x columns describe the
# true label's article, *_y columns the predicted label's article.
y_test_incorrect = y_test_incorrect.rename(
    columns={
        "title_x": "true_title",
        "description_x": "true_description",
        "summary_x": "true_summary",
        "title_y": "pred_title",
        "description_y": "pred_description",
        "summary_y": "pred_summary",
    }
)

In [64]:
# Preview the misclassified rows after renaming the article columns.
y_test_incorrect.head()

Unnamed: 0,true,pred,cui_x,true_title,true_description,true_summary,cui_y,pred_title,pred_description,pred_summary
0,186,155,186,Hillary Clinton Kissed by Former Klan Member S...,,,155,FACT CHECK Hillary Clinton Gave 20 Percent of ...,,
1,186,155,186,Hillary Clinton Kissed by Former Klan Member S...,,,155,FACT CHECK Hillary Clinton Gave 20 Percent of ...,,
2,186,155,186,Hillary Clinton Kissed by Former Klan Member S...,,,155,FACT CHECK Hillary Clinton Gave 20 Percent of ...,,
3,186,155,186,Hillary Clinton Kissed by Former Klan Member S...,,,155,FACT CHECK Hillary Clinton Gave 20 Percent of ...,,
4,186,155,186,Hillary Clinton Kissed by Former Klan Member S...,,,155,FACT CHECK Hillary Clinton Gave 20 Percent of ...,,


In [65]:
# Write each frame to the file whose name matches its contents. The two
# filenames were swapped before (misclassified rows went to the
# "CorrectPredictions" file and vice versa). Both writes are kept in one cell
# so the pairing cannot drift apart again.
y_test_incorrect.to_csv('TF-IDF_SVM_IncorrectPredictions.csv')
y_test_correct.to_csv('TF-IDF_SVM_CorrectPredictions.csv')

In [68]:
# Number of rows in y_test_correct.
len(y_test_correct)

735

In [3]:
# Reload the saved predictions. index_col=0 reads the stored row index back as
# the index, matching the earlier reload of this file, instead of leaving it as
# an extra 'Unnamed: 0' column.
y_test_pred_df = pd.read_csv('TFIDF_SVM_Predictions.csv', index_col=0)

In [6]:
# Macro-averaged F1 over the saved true/pred columns.
f1_score(y_test_pred_df['true'],y_test_pred_df['pred'], average='macro')

0.03639252366974252

In [5]:
# Micro-averaged F1 over the saved true/pred columns.
f1_score(y_test_pred_df['true'],y_test_pred_df['pred'], average='micro')

0.14129181084198386