In [22]:
import json
import glob
import pandas as pd
import pickle
from sklearn.model_selection import train_test_split
from nltk.corpus import stopwords
import string
from nltk.stem import WordNetLemmatizer
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import SVC
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
import random
import csv
from sklearn.decomposition import PCA
from sklearn.decomposition import TruncatedSVD
from sklearn.metrics import f1_score
# Seed Python's and NumPy's global RNGs so runs are repeatable
# (the random baseline further down draws from `random`).
random.seed(42)
np.random.seed(42)

In [2]:
def remove_punctuations(x):
    """Return ``x`` with every punctuation character swapped for one space.

    The punctuation set is ``string.punctuation`` plus the curly double
    quotes and the em dash. Characters are replaced one-for-one, so the
    result has the same length as the input.
    """
    marks = [*string.punctuation, '“', '”', "—"]
    return ''.join(' ' if ch in marks else ch for ch in x)

def preprocess(X):
    """Normalise a pandas Series of texts before TF-IDF vectorisation.

    Steps, in order:
      1. lowercase every text;
      2. drop English stopwords (NLTK list);
      3. replace punctuation with spaces via ``remove_punctuations``;
      4. lemmatize each remaining token with WordNet, dropping any
         stopword that only became a separate token after step 3.

    Parameters
    ----------
    X : pandas.Series of str

    Returns
    -------
    pandas.Series of str
        Cleaned texts, tokens joined by single spaces.
    """
    X = X.str.lower()
    # A set gives constant-time membership; the raw NLTK list would be
    # scanned linearly for every token of every document.
    stop = set(stopwords.words('english'))
    X = X.apply(lambda x: ' '.join(word for word in x.split() if word not in stop))
    X = X.apply(remove_punctuations)
    lemmatizer = WordNetLemmatizer()
    # Second stopword pass: punctuation removal can split tokens such as
    # "and/or" into words that are themselves stopwords.
    X = X.apply(lambda x: ' '.join(lemmatizer.lemmatize(word) for word in x.split() if word not in stop))

    return X

def get_tf_idf_vector(X):
    """Fit a new ``TfidfVectorizer`` on ``X``.

    Returns a ``(fitted_vectorizer, tf_idf_matrix)`` pair, where the matrix
    is the transform of ``X`` itself.
    """
    tfidf = TfidfVectorizer()
    return tfidf, tfidf.fit_transform(X)

def get_tf_idf_vector_test(X,vectorizer):
    """Vectorise ``X`` with an already-fitted ``vectorizer`` (no re-fitting)."""
    transformed = vectorizer.transform(X)
    return transformed

def truncated_svd_on_tf_idf_vector(X, n_components=1000, n_iter=7, random_state=42):
    """Fit a ``TruncatedSVD`` on ``X`` and return ``(fitted_svd, reduced_X)``.

    Parameters
    ----------
    X : matrix accepted by ``TruncatedSVD.fit_transform`` (e.g. a TF-IDF matrix)
    n_components : int, optional
        Output dimensionality. Defaults to 1000, the value previously
        hard-coded here.
    n_iter : int, optional
        Number of solver iterations. Defaults to 7.
    random_state : int, optional
        Seed for the randomized solver. Defaults to 42.

    Returns
    -------
    (TruncatedSVD, reduced matrix)
        Keep the fitted estimator to project validation/test data later.
    """
    truncatedSVD = TruncatedSVD(n_components=n_components, n_iter=n_iter, random_state=random_state)
    X = truncatedSVD.fit_transform(X)
    return truncatedSVD, X

def truncated_svd_on_tf_idf_vector_test(X, truncatedSVD):
    """Project held-out data ``X`` with an already-fitted ``truncatedSVD``.

    Uses ``transform`` rather than ``fit_transform``: re-fitting on the
    validation/test matrix would learn a different basis from the one the
    training data was projected onto (and would leak held-out data into the
    model), making train and test features incomparable.
    """
    return truncatedSVD.transform(X)
    

## Using processed data

In [3]:
# Load the pickled dictionary that the next cell turns into the article table.
# NOTE(review): pickle.load can execute arbitrary code embedded in the file —
# only load artifacts produced by a trusted pipeline.
with open('./data/processed/dictionary.pickle','rb') as f:
    dictionary = pickle.load(f)

In [4]:
# Article table; later cells join on its 'cui' column and read
# 'title', 'description' and 'summary' from it.
article_df = pd.DataFrame.from_dict(dictionary)

In [5]:
def return_list_of_dict(filename):
    """Read a JSON-Lines file from ``./data/processed/`` into a list.

    Parameters
    ----------
    filename : str
        File name relative to ``./data/processed/``.

    Returns
    -------
    list
        One parsed JSON value per non-blank line, in file order.

    Raises
    ------
    json.JSONDecodeError
        If a non-blank line is not valid JSON.
    """
    records = []
    # Read as UTF-8 explicitly rather than with the platform's default
    # encoding, so non-ASCII text decodes the same way on every machine.
    with open('./data/processed/'+filename, 'r', encoding='utf-8') as f:
        for line in f:
            # Tolerate blank lines (e.g. a trailing newline at end of file).
            if not line.strip():
                continue
            records.append(json.loads(line))
    return records

In [6]:
# Parse the three JSON-Lines splits into lists of records.
train = return_list_of_dict('train.jsonl')
val = return_list_of_dict('valid.jsonl')
test = return_list_of_dict('test.jsonl')

In [7]:
# One DataFrame per split, built from the parsed records.
train_df = pd.DataFrame.from_dict(train)
val_df = pd.DataFrame.from_dict(val)
test_df = pd.DataFrame.from_dict(test)

In [8]:
# Peek at the first rows of the training frame.
train_df.head()

Unnamed: 0,mention,mention_id,context_left,context_right,context_doc_id,type,label_id,label,label_title
0,The families of these four should sit in the f...,780136889441456128,,,,,272,,
1,user person didnt lower my my taxes and he rai...,913075551212011520,,,,,273,,
2,JaredBerry316 glennbeck like when white Christ...,753045148775440384,,,,,274,,
3,AdamKazda 60 diplomats were killed on Bushs wa...,743830161393520640,,,,,14,,
4,fight4women Wow you got a source for all that ...,818188352314900480,,,,,275,,


In [9]:
# Model inputs: the raw 'mention' text of each split.
X_train = train_df['mention']
X_val = val_df['mention']
X_test = test_df['mention']

In [10]:
# Targets: the 'label_id' of each split (joined against the article 'cui' below).
y_train = train_df['label_id']
y_val = val_df['label_id']
y_test = test_df['label_id']

In [11]:
# Attach article fields to each training row by matching label_id to cui.
train_article_df = train_df.merge(article_df,left_on='label_id', right_on='cui')

In [12]:
# Keep only the article columns of the joined frame.
only_articles_in_train = train_article_df[['cui', 'title', 'description','summary']]

In [13]:
# Collapse to one row per distinct (cui, title, description, summary) combination.
only_articles_in_train = only_articles_in_train.drop_duplicates()

In [14]:
def add_column_strings(df, col1, col2, sep=''):
    """Combine two columns of ``df`` row by row and return the results as a list.

    Parameters
    ----------
    df : pandas.DataFrame
    col1, col2 : column labels
        Columns whose values are added with ``+`` (string concatenation for
        text columns).
    sep : str, optional
        Text inserted between the two values. Defaults to ``''``, i.e. plain
        ``col1 + col2`` as before. Pass ``' '`` when the result is tokenised
        later, otherwise the last word of ``col1`` fuses with the first word
        of ``col2``.

    Returns
    -------
    list
        One combined value per row, in row order.
    """
    # zip walks both columns once, instead of two positional .iloc lookups per row.
    pairs = zip(df[col1], df[col2])
    if sep:
        return [left + sep + right for left, right in pairs]
    return [left + right for left, right in pairs]

## Classification using full length articles

In [15]:
# Training corpus = all training tweets followed by one "title + description"
# string per distinct article. The articles are appended last on purpose:
# a later cell slices them back out from the tail of the TF-IDF matrix.
X_train_tweets = train_df['mention']
X_train_articles = pd.Series(add_column_strings(only_articles_in_train, 'title', 'description'))
frames = [X_train_tweets, X_train_articles]
X_train = pd.concat(frames)

In [16]:
# Always preprocess from the raw inputs (never `X = preprocess(X)`), so that
# re-running this cell cannot preprocess already-cleaned text a second time.
X_train = preprocess(pd.concat(frames))
X_test = preprocess(test_df['mention'])
X_val = preprocess(val_df['mention'])

In [17]:
# Doing PCA here using truncated SVD as PCA does not work for sparse matrices
vectorizer, X_train_tf_idf_vector = get_tf_idf_vector(X_train)
#truncatedSVD, X_train_svd = truncated_svd_on_tf_idf_vector(X_train_tf_idf_vector)

X_val_tf_idf_vector = get_tf_idf_vector_test(X_val,vectorizer)
#X_val_svd = truncated_svd_on_tf_idf_vector_test(X_val_tf_idf_vector,truncatedSVD)

X_test_tf_idf_vector = get_tf_idf_vector_test(X_test,vectorizer)
#X_test_svd = truncated_svd_on_tf_idf_vector_test(X_test_tf_idf_vector, truncatedSVD)

In [18]:
# The article texts were appended after the tweets, so they occupy the last
# len(only_articles_in_train) rows of the training matrix. Deriving the count
# (instead of hard-coding 3100) keeps vectors and labels aligned if the data changes.
n_articles = len(only_articles_in_train)
# An explicit start index (rather than [-n_articles:]) stays correct when n_articles is 0.
article_tf_idf = X_train_tf_idf_vector[X_train_tf_idf_vector.shape[0] - n_articles:]
article_labels = only_articles_in_train['cui']

In [21]:
# Sanity check: (number of article rows, number of TF-IDF features).
article_tf_idf.shape

(3100, 76693)

In [19]:
# Nearest-article classification: each test tweet is assigned the 'cui' of the
# article whose TF-IDF vector has the largest dot product with the tweet's vector.
correct = 0
total = 0 
correctly_classified=[]
incorrectly_classified=[]
incorrectly_predicted_label=[]
y_pred_lst=[]
# Loop-invariant: transpose the article matrix once, not on every iteration.
article_tf_idf_T = article_tf_idf.T
for i in range(X_test_tf_idf_vector.shape[0]):
    # Use the sparse matrix's own .dot(): np.dot is not sparse-aware and only
    # works by accident of operator overloading on scipy matrix types.
    dot_product = X_test_tf_idf_vector[i].dot(article_tf_idf_T)
    argmax = dot_product.argmax()
    y_pred = article_labels.iloc[argmax]
    y_pred_lst.append(y_pred)
    # Positional lookup, consistent with test_df.iloc[i] and the matrix row i
    # (y_test[i] would be a label lookup and depends on the index being 0..n-1).
    if y_test.iloc[i] == y_pred:
        correct+=1
        correctly_classified.append(test_df.iloc[i])
    else:
        incorrectly_classified.append(test_df.iloc[i])
        incorrectly_predicted_label.append(y_pred)
    total+=1
print("Accuracy=",correct*100/total)

Accuracy= 12.053056516724336


In [36]:
# Micro-averaged F1 of the full-article predictions; the recorded value
# matches the accuracy printed above.
f1_score(list(y_test),y_pred_lst, average='micro')

0.12053056516724336

In [37]:
# Macro-averaged F1 of the full-article predictions.
f1_score(list(y_test),y_pred_lst, average='macro')

0.07625551929210807

In [20]:
def error_eval_full_articles(correctly_classified, incorrectly_classified, incorrectly_predicted_label):
    """Build error-analysis tables for the full-article (title + description) run.

    Reads the module-level ``article_df``.

    Parameters
    ----------
    correctly_classified : list of test rows whose prediction matched the label
    incorrectly_classified : list of test rows whose prediction did not match
    incorrectly_predicted_label : list
        The wrongly predicted label for each entry of ``incorrectly_classified``,
        in the same order.

    Returns
    -------
    (correctly_classified_df, incorrectly_classified_df)
        The first holds the correct rows joined to their article, without the
        'summary' column. The second holds the misclassified rows with the true
        article ('true_*' columns) and the predicted article ('pred_*' columns)
        side by side.

    Side effects: prints the head of the predicted-article lookup table and the
    shape of the misclassified frame before and after the predicted-article merge.
    """
    correctly_classified_df = pd.DataFrame(correctly_classified)
    # Join each correct row to the article it was labelled with.
    correctly_classified_df = correctly_classified_df.merge(article_df,left_on='label_id', right_on='cui')
    correctly_classified_df=correctly_classified_df.drop(['context_left','context_right','context_doc_id','type','label','label_title','summary'],axis=1)
    
    incorrectly_classified_df = pd.DataFrame(incorrectly_classified)
    # Attach the prediction before any merge so it travels with its row.
    incorrectly_classified_df['pred_label'] = incorrectly_predicted_label
    # First merge: the article of the TRUE label.
    incorrectly_classified_df = incorrectly_classified_df.merge(article_df,left_on='label_id', right_on='cui')
    # Lookup table: article fields for every predicted label (deduplicated below).
    incorrect_pred = pd.DataFrame(incorrectly_predicted_label,columns=['label_id']).merge(article_df,left_on='label_id', right_on='cui')
    print(incorrect_pred.head())
    print(incorrectly_classified_df.shape)
    # Second merge: the article of the PREDICTED label. Columns present on both
    # sides get pandas' '_x' (true) / '_y' (predicted) suffixes, which the drop
    # and rename below rely on.
    incorrectly_classified_df = incorrectly_classified_df.merge(incorrect_pred.drop_duplicates(),how='left',left_on='pred_label', right_on='label_id')
    print(incorrectly_classified_df.shape)
    incorrectly_classified_df = incorrectly_classified_df.drop(['context_left','context_right','context_doc_id','type','label','label_title','cui_x','label_id_y','cui_y','summary_x','summary_y'],axis=1)
    incorrectly_classified_df= incorrectly_classified_df.rename(columns={"label_id_x":"true_label","title_x":"true_title","description_x":"true_description","title_y":"pred_title","description_y":"pred_description"})
    return correctly_classified_df, incorrectly_classified_df


In [24]:
# Build the correct / incorrect prediction tables for the full-article run.
correctly_classified_df, incorrectly_classified_df=error_eval_full_articles(correctly_classified, incorrectly_classified, incorrectly_predicted_label)

  label_id  cui                                        title description  \
0      183  183  Gun Deaths vs Baseball Bat Deaths Snopescom               
1      183  183  Gun Deaths vs Baseball Bat Deaths Snopescom               
2      183  183  Gun Deaths vs Baseball Bat Deaths Snopescom               
3      183  183  Gun Deaths vs Baseball Bat Deaths Snopescom               
4      183  183  Gun Deaths vs Baseball Bat Deaths Snopescom               

  summary  
0          
1          
2          
3          
4          
(4575, 14)
(4575, 19)


In [25]:
# Save the misclassified rows of the full-article run to CSV.
incorrectly_classified_df.to_csv('TF-IDF_kNN_IncorrectPredictionsUsingFullArticles.csv')

In [26]:
# Save the correctly classified rows of the full-article run to CSV.
correctly_classified_df.to_csv('TF-IDF_kNN_CorrectPredictionsUsingFullArticles.csv')

## Classification using summaries

In [38]:
# Same corpus layout as the full-article run, but each article is represented
# by "title + summary". Articles are again appended after the tweets, so a
# later cell can slice them from the tail of the TF-IDF matrix.
X_train_tweets = train_df['mention']
X_train_articles = pd.Series(add_column_strings(only_articles_in_train, 'title', 'summary'))
frames = [X_train_tweets, X_train_articles]
X_train = pd.concat(frames)

In [39]:
# Preprocess from the RAW inputs. X_test / X_val still hold the already
# preprocessed text from the full-article section, so `preprocess(X_test)`
# would clean (and lemmatize) the same text a second time.
X_train = preprocess(pd.concat(frames))
X_test = preprocess(test_df['mention'])
X_val = preprocess(val_df['mention'])

In [40]:
# Doing PCA here using truncated SVD as PCA does not work for sparse matrices
vectorizer, X_train_tf_idf_vector = get_tf_idf_vector(X_train)
#truncatedSVD, X_train_svd = truncated_svd_on_tf_idf_vector(X_train_tf_idf_vector)

X_val_tf_idf_vector = get_tf_idf_vector_test(X_val,vectorizer)
#X_val_svd = truncated_svd_on_tf_idf_vector_test(X_val_tf_idf_vector,truncatedSVD)

X_test_tf_idf_vector = get_tf_idf_vector_test(X_test,vectorizer)
#X_test_svd = truncated_svd_on_tf_idf_vector_test(X_test_tf_idf_vector, truncatedSVD)

In [41]:
# The article summaries were appended after the tweets, so they occupy the
# last len(only_articles_in_train) rows of the training matrix. Deriving the
# count (instead of hard-coding 3100) keeps vectors and labels aligned.
n_articles = len(only_articles_in_train)
# An explicit start index (rather than [-n_articles:]) stays correct when n_articles is 0.
summary_tf_idf = X_train_tf_idf_vector[X_train_tf_idf_vector.shape[0] - n_articles:]
summary_labels = only_articles_in_train['cui']

In [44]:
# Nearest-summary classification plus a uniform-random baseline: each test
# tweet is assigned the 'cui' of the summary whose TF-IDF vector has the
# largest dot product with the tweet's vector.
correct = 0
total = 0 
correct_random = 0
correctly_classified=[]
incorrectly_classified=[]
incorrectly_predicted_label=[]
y_pred_lst=[]
# Loop-invariants: transpose once; derive the candidate count from the labels
# instead of hard-coding 3099.
summary_tf_idf_T = summary_tf_idf.T
last_candidate = len(summary_labels) - 1
for i in range(X_test_tf_idf_vector.shape[0]):
    # Use the sparse matrix's own .dot(): np.dot is not sparse-aware and only
    # works by accident of operator overloading on scipy matrix types.
    dot_product = X_test_tf_idf_vector[i].dot(summary_tf_idf_T)
    argmax = dot_product.argmax()
    y_pred = summary_labels.iloc[argmax]
    # Baseline: pick one candidate article uniformly at random.
    random_argmax = random.randint(0,last_candidate)
    y_pred_random = summary_labels.iloc[random_argmax]
    y_pred_lst.append(y_pred)
    # Positional lookup, consistent with test_df.iloc[i] and the matrix row i
    # (y_test[i] would be a label lookup and depends on the index being 0..n-1).
    y_true = y_test.iloc[i]
    if y_true == y_pred:
        correct+=1
        correctly_classified.append(test_df.iloc[i])
    else:
        incorrectly_classified.append(test_df.iloc[i])
        incorrectly_predicted_label.append(y_pred)
    if y_true == y_pred_random:
        correct_random +=1
    total+=1
print("Random accuracy=",correct_random*100/total)
print("Real accuracy=",correct*100/total)

Random accuracy= 0.0
Real accuracy= 12.245290272971934


In [45]:
# Micro-averaged F1 of the summary-based predictions; the recorded value
# matches the "Real accuracy" printed above.
f1_score(list(y_test),y_pred_lst, average='micro')

0.12245290272971934

In [46]:
# Macro-averaged F1 of the summary-based predictions.
f1_score(list(y_test),y_pred_lst, average='macro')

0.07512497365764031

In [32]:
def error_eval_summary(correctly_classified, incorrectly_classified, incorrectly_predicted_label):
    """Build error-analysis tables for the summary-based (title + summary) run.

    Reads the module-level ``article_df``.

    Parameters
    ----------
    correctly_classified : list of test rows whose prediction matched the label
    incorrectly_classified : list of test rows whose prediction did not match
    incorrectly_predicted_label : list
        The wrongly predicted label for each entry of ``incorrectly_classified``,
        in the same order.

    Returns
    -------
    (correctly_classified_df, incorrectly_classified_df)
        The first holds the correct rows joined to their article, without the
        'description' column. The second holds the misclassified rows with the
        true article ('true_*' columns) and the predicted article ('pred_*'
        columns) side by side.
    """
    correctly_classified_df = pd.DataFrame(correctly_classified)
    # Join each correct row to the article it was labelled with.
    correctly_classified_df = correctly_classified_df.merge(article_df,left_on='label_id', right_on='cui')
    correctly_classified_df=correctly_classified_df.drop(['context_left','context_right','context_doc_id','type','label','label_title','description'],axis=1)
    
    incorrectly_classified_df = pd.DataFrame(incorrectly_classified)
    # Attach the prediction before any merge so it travels with its row.
    incorrectly_classified_df['pred_label'] = incorrectly_predicted_label
    # First merge: the article of the TRUE label.
    incorrectly_classified_df = incorrectly_classified_df.merge(article_df,left_on='label_id', right_on='cui')
    # Lookup table: article fields for every predicted label (deduplicated below).
    incorrect_pred = pd.DataFrame(incorrectly_predicted_label,columns=['label_id']).merge(article_df,left_on='label_id', right_on='cui')
    # Second merge: the article of the PREDICTED label; overlapping columns get
    # pandas' '_x' (true) / '_y' (predicted) suffixes used by the drop/rename below.
    # NOTE(review): unlike error_eval_full_articles, this merge omits how='left'
    # and keys the right side on 'cui' rather than 'label_id' — confirm the
    # difference is intended (rows without a matching article would be dropped here).
    incorrectly_classified_df = incorrectly_classified_df.merge(incorrect_pred.drop_duplicates(),left_on='pred_label', right_on='cui')
    incorrectly_classified_df = incorrectly_classified_df.drop(['context_left','context_right','context_doc_id','type','label','label_title','cui_x','label_id_y','cui_y','description_x','description_y'],axis=1)
    incorrectly_classified_df= incorrectly_classified_df.rename(columns={"label_id_x":"true_label","title_x":"true_title","summary_x":"true_summary","title_y":"pred_title","summary_y":"pred_summary"})
    return correctly_classified_df, incorrectly_classified_df


In [33]:
 # Build the correct / incorrect prediction tables for the summary-based run.
 correctly_classified_df, incorrectly_classified_df = error_eval_summary(correctly_classified, incorrectly_classified, incorrectly_predicted_label)

In [36]:
# Save both summary-run tables to CSV.
incorrectly_classified_df.to_csv('TF-IDF_kNN_IncorrectPredictionsUsingSummaries.csv')
correctly_classified_df.to_csv('TF-IDF_kNN_CorrectPredictionsUsingSummaries.csv')