In [1]:
# Load contractions model
from pycontractions import Contractions

cont = Contractions(api_key="glove-twitter-100")
cont.load_models()

In [2]:
import pandas as pd
import numpy as np
import csv
import re
import math

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import mutual_info_classif
from sklearn import metrics

from sklearn.svm import LinearSVC
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split

### Load Data

In [3]:
cluster_name = 'c1'
file = 'Data/'+cluster_name+'.csv'
df = pd.read_csv(file,sep=",")
df = df.loc[df['rating'] != 'OTHER']
df = df.loc[df['organization'] != 'snopes']

### Preprocess

In [4]:
def preprocess_claim(sentence):
    sentence = sentence.lower().strip()
    translator = str.maketrans('’', "'", '')
    sentence = sentence.translate(translator)
    sentence = re.sub("u\.s\.","united states",sentence)
    sentence = list(cont.expand_texts([sentence],precise=True))[0]
    sentence = re.sub("[^a-zA-Z0-9_.’,]|(?<!\d)\.(?!\d)|(?<!\w)-(?!\w)|(?<!\d)\,(?!\d)",' ',sentence)
    sentence = re.sub(",",'',sentence)
    sentence = re.sub("\.",'',sentence)
    sentence = re.sub(" a ",' ',sentence)
    sentence = re.sub('\s+', ' ', sentence).strip()
    sentence = re.sub(" s ",' ',sentence)
    if sentence[0:5] == 'says ':
        sentence = sentence[5:]
    sentence = ' '.join([w for w in sentence.split() if len(w)>1])
    return sentence

def preprocess_df(df):
    for index in df.index:
        df.at[index,'text'] = preprocess_claim(df.at[index,'text'])
        rating = df.at[index,'rating']
        if rating == 'FALSE':
            df.at[index,'rating'] = 0
        if rating == 'TRUE':
            df.at[index,'rating'] = 1
        if rating == 'MIXTURE':
            df.at[index,'rating'] = 2
    return df

In [5]:
df = preprocess_df(df)

### Add authors to DF

In [6]:
# Read authors
authors_dict = dict()

with open('Data/authors_classified.csv', 'r', newline='', encoding="utf-8") as file:
    reader = csv.reader(file)
    next(reader)  # Skip header row.
    for row in reader:
        authors_dict[row[0]] = row[1]

In [7]:
# Delete unclassified rows
ix=[i for i in df.index if ((df.at[i,'author'] in authors_dict.keys()))]
df = df.loc[ix]
df = df.reset_index(drop=True)

In [8]:
def add_type_columns(df):
    df['person'] = 0
    df['democrat'] = 0
    df['republican'] = 0
    df['political'] = 0
    df['journalist'] = 0
    df['organization'] = 0
    for i in df.index:
        author_type = authors_dict[df.at[i,'author']]
        if author_type == 'Person':
            df.at[i,'person'] = 1
        if author_type == 'Democrat':
            df.at[i,'democrat'] = 1
        if author_type == 'Republican':
            df.at[i,'republican'] = 1
        if author_type == 'Political':
            df.at[i,'political'] = 1
        if author_type == 'Journalist':
            df.at[i,'journalist'] = 1
        if author_type == 'Organization':
            df.at[i,'organization'] = 1
    return df

In [9]:
df = add_type_columns(df)

### Get credibility vectors

In [10]:
def get_cred_vectors(df, columns):
    ratings = sorted(df['rating'].unique())
    cred_vectors = {key:{rating:0 for rating in ratings} for key in columns}

    for i in df.index:
        for column in columns:
            if df.at[i, column] == 1:
                cred_vectors[column][df.at[i, 'rating']] = cred_vectors[column][df.at[i, 'rating']] + 1
    return cred_vectors

In [11]:
def normalize_cred_vectors(cred_vectors):
    norm_cred_vectors = dict()

    for key in cred_vectors:
        norm_cred_vectors[key] = dict()
        total = sum(cred_vectors[key].values()) + len(cred_vectors[key])
        for rating in cred_vectors[key]:
            norm_cred_vectors[key][rating] = (cred_vectors[key][rating] + 1) / total
    return norm_cred_vectors

In [12]:
columns = ['person','democrat','republican','political','journalist','organization']
norm_cred_vectors = normalize_cred_vectors(get_cred_vectors(df,columns))

In [13]:
def vectorize(df, cred_vectors, columns):
    vectorizer = TfidfVectorizer(ngram_range=ngram,min_df=mindf)
    X = vectorizer.fit_transform(df['text']).toarray()
    df_vectors = pd.DataFrame(X, columns = vectorizer.get_feature_names())
    
    for column in columns:
        df_vectors[column] = 0
    
    for key in cred_vectors[columns[0]]:
        df_vectors[key] = 0.0
    
    for i in df.index:
        for column in columns:
            if df.at[i,column] == 1:
                df_vectors.at[i,column] = 1
                for key in cred_vectors[column]:
                    df_vectors.at[i,key] = cred_vectors[column][key]             
    return df_vectors.to_numpy()
    #return df_vectors

In [14]:
ngram = (1,3)
mindf = 2

In [15]:
def classify(df,cred_vectors,columns,key):
    X = vectorize(df,cred_vectors,columns)
    y = df["rating"]
    y = y.astype('int')
    clf = LinearSVC(random_state=0, tol=1e-5)
    k_fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=None)
    score = cross_val_score(clf, X, y, cv=k_fold, scoring='accuracy')
    print('%s - Mean accuracy: %f, Deviation: %f' % (('%s with authors'%key),score.mean(),score.std()))
    return score.mean()

In [16]:
iterations = 10

scores = list()
for i in range(iterations):
    scores.append(classify(df,norm_cred_vectors,columns,'3 Classes'))

avg = sum(scores)/len(scores)
print("Average of %d runs: %f\n" % (iterations,avg))

3 Classes with authors - Mean accuracy: 0.547066, Deviation: 0.038350
3 Classes with authors - Mean accuracy: 0.550814, Deviation: 0.043150
3 Classes with authors - Mean accuracy: 0.558845, Deviation: 0.047979
3 Classes with authors - Mean accuracy: 0.557812, Deviation: 0.029218
3 Classes with authors - Mean accuracy: 0.538719, Deviation: 0.045125
3 Classes with authors - Mean accuracy: 0.557715, Deviation: 0.059912
3 Classes with authors - Mean accuracy: 0.555430, Deviation: 0.033169
3 Classes with authors - Mean accuracy: 0.550382, Deviation: 0.038754
3 Classes with authors - Mean accuracy: 0.538778, Deviation: 0.027734
3 Classes with authors - Mean accuracy: 0.554243, Deviation: 0.026126
Average of 10 runs: 0.550980



### Cred vectors only

In [17]:
def vectorize2(df, cred_vectors, columns):
    vectorizer = TfidfVectorizer(ngram_range=ngram,min_df=mindf)
    X = vectorizer.fit_transform(df['text']).toarray()
    df_vectors = pd.DataFrame(X, columns = vectorizer.get_feature_names())
    
    for column in columns:
        df_vectors[column] = 0
    
    for key in cred_vectors[columns[0]]:
        df_vectors[key] = 0.0
    
    for i in df.index:
        for column in columns:
            if df.at[i,column] == 1:
                df_vectors.at[i,column] = 1
                for key in cred_vectors[column]:
                    df_vectors.at[i,key] = cred_vectors[column][key]
    
    columns2 = columns.copy()
    columns2.append(0)
    columns2.append(1)
    columns2.append(2)
    df2 = df_vectors[columns2].copy()
    return df2.to_numpy()

In [18]:
def classify2(df,cred_vectors,columns,key):
    X = vectorize2(df,cred_vectors,columns)
    y = df["rating"]
    y = y.astype('int')
    clf = LinearSVC(random_state=0, tol=1e-5)
    k_fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=None)
    score = cross_val_score(clf, X, y, cv=k_fold, scoring='accuracy')
    print('%s - Mean accuracy: %f, Deviation: %f' % (('%s with authors'%key),score.mean(),score.std()))
    return score.mean()

In [19]:
iterations = 10

scores = list()
for i in range(iterations):
    scores.append(classify2(df,norm_cred_vectors,columns,'3 Classes'))

avg = sum(scores)/len(scores)
print("Average of %d runs: %f\n" % (iterations,avg))

3 Classes with authors - Mean accuracy: 0.583219, Deviation: 0.010346
3 Classes with authors - Mean accuracy: 0.583190, Deviation: 0.006579
3 Classes with authors - Mean accuracy: 0.583176, Deviation: 0.009213
3 Classes with authors - Mean accuracy: 0.583248, Deviation: 0.013015
3 Classes with authors - Mean accuracy: 0.583263, Deviation: 0.012499
3 Classes with authors - Mean accuracy: 0.583219, Deviation: 0.010346
3 Classes with authors - Mean accuracy: 0.583190, Deviation: 0.008455
3 Classes with authors - Mean accuracy: 0.583277, Deviation: 0.014261
3 Classes with authors - Mean accuracy: 0.583190, Deviation: 0.008463
3 Classes with authors - Mean accuracy: 0.583234, Deviation: 0.012309
Average of 10 runs: 0.583221



### Data Split

In [20]:
X_train, X_test, y_train, y_test = train_test_split(df, df["rating"],test_size=0.3, random_state=0, 
                                                    stratify = df["rating"])

X_train = X_train.reset_index(drop=True)
y_train = y_train.reset_index(drop=True)
X_test = X_test.reset_index(drop=True)
y_test = y_test.reset_index(drop=True)

### Baseline Accuracy

In [21]:
# Train
vectorizer = TfidfVectorizer(ngram_range=ngram,min_df=mindf)
X = vectorizer.fit_transform(X_train['text']).toarray()
y = y_train.astype('int')

clf = LinearSVC(random_state=0, tol=1e-5)
clf.fit(X,y)

#Predict
X = vectorizer.transform(X_test['text']).toarray()
y = y_test.astype('int')
s = clf.score(X,y)

print("Baseline accuracy is %f" % s)

Baseline accuracy is 0.522088


### TF IDF + Vectors Accuracy

In [22]:
#Get vectors
v = normalize_cred_vectors(get_cred_vectors(X_train,columns))

In [23]:
def vectorize3(df, cred_vectors, columns, vectorizer, fit=True):
    if fit == True:
        X = vectorizer.fit_transform(df['text']).toarray()
    else:
        X = vectorizer.transform(df['text']).toarray()
    
    df_vectors = pd.DataFrame(X, columns = vectorizer.get_feature_names())
    
    for column in columns:
        df_vectors[column] = 0
    
    for key in cred_vectors[columns[0]]:
        df_vectors[key] = 0.0
    
    for i in df.index:
        for column in columns:
            if df.at[i,column] == 1:
                df_vectors.at[i,column] = 1
                for key in cred_vectors[column]:
                    df_vectors.at[i,key] = cred_vectors[column][key]             
    return df_vectors.to_numpy()

In [24]:
#Train
vectorizer = TfidfVectorizer(ngram_range=ngram,min_df=mindf)
X = vectorize3(X_train, v, columns, vectorizer, fit=True)
y = y_train.astype('int')

clf = LinearSVC(random_state=0, tol=1e-5)
clf.fit(X,y)

#Predict
X = vectorize3(X_test, v, columns, vectorizer, fit=False)
y = y_test.astype('int')
s = clf.score(X,y)

print("TF-IDF + Vectors accuracy is %f" % s)

TF-IDF + Vectors accuracy is 0.546185


### Vectors only accuracy

In [25]:
def vectorize4(df, cred_vectors, columns):
    df_vectors = pd.DataFrame(columns = columns)
    
    for column in columns:
        df_vectors[column] = 0
    
    for key in cred_vectors[columns[0]]:
        df_vectors[key] = 0.0
    
    for i in df.index:
        for column in columns:
            if df.at[i,column] == 1:
                for key in cred_vectors[column]:
                    df_vectors.at[i,key] = cred_vectors[column][key]
    
    columns2 = [0,1,2]
    df2 = df_vectors[columns2].copy()
    return df2.to_numpy()

In [26]:
#Train
X = vectorize4(X_train, v, columns)
y = y_train.astype('int')

clf = LinearSVC(random_state=0, tol=1e-5)
clf.fit(X,y)

#Predict
X = vectorize4(X_test, v, columns)
y = y_test.astype('int')
s = clf.score(X,y)

print("Vectors only accuracy is %f" % s)

Vectors only accuracy is 0.578313
