In [126]:
# Load contractions model
from pycontractions import Contractions

# `cont` is the contraction expander that preprocess_claim calls through
# `cont.expand_texts`; it is configured with the "glove-twitter-100" model name.
# NOTE(review): load_models() presumably loads the embedding eagerly (may be slow) — confirm.
cont = Contractions(api_key="glove-twitter-100")
cont.load_models()

In [257]:
import pandas as pd
import numpy as np
import csv
import re
import math

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import mutual_info_classif
from sklearn import metrics

from sklearn.svm import LinearSVC
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

### Load Data

In [258]:
# Read the claims of one cluster, then keep only the rows that are neither
# rated 'OTHER' nor attributed to the organization 'snopes'.
cluster_name = 'c1'
file = 'Data/'+cluster_name+'.csv'
df = pd.read_csv(file, sep=",")
keep_row = (df['rating'] != 'OTHER') & (df['organization'] != 'snopes')
df = df.loc[keep_row]

### Preprocess

In [259]:
def preprocess_claim(sentence):
    """Normalise one claim string before it is vectorised.

    The steps run in a fixed order:

    1. lower-case and strip the text;
    2. replace the typographic apostrophe with an ASCII one;
    3. rewrite "u.s." as "united states";
    4. expand contractions with the module-level ``cont`` model;
    5. replace punctuation with spaces, except dots and commas that touch a
       digit, which are then deleted without leaving a space;
    6. drop the words "a" and "s" where they stand between spaces, collapse
       whitespace, remove a leading "says ", and drop one-character tokens.

    Args:
        sentence: the raw claim text (``str``).

    Returns:
        The cleaned claim as a single space-separated string.
    """
    sentence = sentence.lower().strip()
    sentence = sentence.translate(str.maketrans('’', "'"))

    # All patterns are raw strings: the same regexes as before, but without
    # the invalid "\." / "\d" / "\s" escapes inside ordinary string literals.
    sentence = re.sub(r"u\.s\.", "united states", sentence)
    sentence = list(cont.expand_texts([sentence], precise=True))[0]

    # Anything outside the allowed character set becomes a space, as do dots
    # and commas that have no digit on either side.
    # NOTE(review): "-" is not in the allowed set, so the first alternative
    # already matches every hyphen and the "(?<!\w)-(?!\w)" branch never fires.
    sentence = re.sub(
        r"[^a-zA-Z0-9_.’,]|(?<!\d)\.(?!\d)|(?<!\w)-(?!\w)|(?<!\d)\,(?!\d)",
        ' ',
        sentence,
    )
    # The dots and commas that survived (those next to a digit) are removed
    # outright, so the surrounding characters are joined.
    sentence = re.sub(r",", '', sentence)
    sentence = re.sub(r"\.", '', sentence)

    sentence = re.sub(r" a ", ' ', sentence)
    sentence = re.sub(r'\s+', ' ', sentence).strip()
    sentence = re.sub(r" s ", ' ', sentence)

    if sentence[0:5] == 'says ':
        sentence = sentence[5:]

    # Finally discard every token that is a single character long.
    sentence = ' '.join([w for w in sentence.split() if len(w) > 1])
    return sentence

def preprocess_df(df):
    """Clean every claim text and encode the rating labels of ``df`` in place.

    Each value of the 'text' column is passed through ``preprocess_claim``.
    In the 'rating' column, 'FALSE' becomes 0, 'TRUE' becomes 1 and 'MIXTURE'
    becomes 2; any other value is left untouched.

    Args:
        df: DataFrame with 'text' and 'rating' columns.

    Returns:
        The same DataFrame object, modified in place.
    """
    rating_codes = {'FALSE': 0, 'TRUE': 1, 'MIXTURE': 2}
    for row_label in df.index:
        raw_text = df.at[row_label, 'text']
        df.at[row_label, 'text'] = preprocess_claim(raw_text)

        label = df.at[row_label, 'rating']
        if label in rating_codes:
            df.at[row_label, 'rating'] = rating_codes[label]
    return df

In [260]:
# Clean the claim texts and encode the ratings; preprocess_df edits df in place and returns it.
df = preprocess_df(df)

### Add authors to DF

In [261]:
# Build the author -> author-type lookup from the first two CSV columns
authors_dict = dict()

with open('Data/authors_classified.csv', 'r', newline='', encoding="utf-8") as file:
    reader = csv.reader(file)
    next(reader)  # the first row is the header
    authors_dict.update((row[0], row[1]) for row in reader)

In [262]:
# Keep only the rows whose author has an entry in authors_dict, then renumber the index from 0
known_authors = authors_dict.keys()
ix = [label for label, author in zip(df.index, df['author']) if author in known_authors]
df = df.loc[ix].reset_index(drop=True)

In [263]:
def add_type_columns(df, author_types=None):
    """Add one 0/1 indicator column per author type to ``df`` (in place).

    The columns 'person', 'democrat', 'republican', 'political', 'journalist'
    and 'organization' are created (or overwritten) with 0, and for every row
    the column matching the type of its author is set to 1. Rows whose author
    type is none of the six known types keep 0 everywhere.

    Note that an existing 'organization' column is replaced by the indicator
    column of the same name.

    Args:
        df: DataFrame with an 'author' column.
        author_types: mapping from author to author type ('Person',
            'Democrat', 'Republican', 'Political', 'Journalist' or
            'Organization'). Defaults to the module-level ``authors_dict``.

    Returns:
        The same DataFrame object, modified in place.

    Raises:
        KeyError: if an author of ``df`` is missing from ``author_types``.
    """
    if author_types is None:
        author_types = authors_dict

    column_for_type = {
        'Person': 'person',
        'Democrat': 'democrat',
        'Republican': 'republican',
        'Political': 'political',
        'Journalist': 'journalist',
        'Organization': 'organization',
    }

    for column in column_for_type.values():
        df[column] = 0

    for i in df.index:
        column = column_for_type.get(author_types[df.at[i, 'author']])
        if column is not None:
            df.at[i, column] = 1
    return df

In [264]:
# Add the six 0/1 author-type columns; add_type_columns edits df in place and returns it.
df = add_type_columns(df)

### Get credibility vectors

In [265]:
# Count, for every author type, how many of its claims carry each rating
columns = ['person','democrat','republican','political','journalist','organization']
ratings = sorted(df['rating'].unique())

cred_vectors = {}
for column in columns:
    cred_vectors[column] = dict.fromkeys(ratings, 0)

for i in df.index:
    row_rating = df.at[i, 'rating']
    for column in columns:
        if df.at[i, column] == 1:
            cred_vectors[column][row_rating] += 1

In [266]:
# Turn the counts into add-one smoothed proportions:
# (count + 1) / (sum of counts + number of ratings)
norm_cred_vectors = dict()

for key, counts in cred_vectors.items():
    total = sum(counts.values()) + len(counts)
    norm_cred_vectors[key] = {
        rating: (count + 1) / total for rating, count in counts.items()
    }

In [267]:
def vectorize(df, cred_vectors, columns):
    """Build the feature matrix: TF-IDF text features plus author features.

    The result has one row per row of ``df`` and, from left to right:

    * the TF-IDF features of ``df['text']`` (settings taken from the
      module-level ``ngram`` and ``mindf``);
    * one 0/1 flag per entry of ``columns``, copied from ``df``;
    * the credibility vector of the row's author type, one value per key of
      ``cred_vectors[columns[0]]``; rows with no flag set keep zeros.

    The author features are assembled as separate NumPy blocks and stacked
    next to the text features. This avoids two problems of building them as
    extra DataFrame columns: a flag column named like an n-gram (for example
    'republican') would overwrite that n-gram's TF-IDF values, and rows were
    addressed by the labels of ``df.index`` although the feature rows are
    positional. It also no longer needs the vectorizer's feature-name
    accessor.

    Args:
        df: DataFrame with a 'text' column and the 0/1 columns in ``columns``.
        cred_vectors: mapping column name -> {rating: value}; every inner
            mapping is expected to share the keys of the first one.
        columns: names of the author-type flag columns.

    Returns:
        A 2-D float ``numpy.ndarray``.
    """
    vectorizer = TfidfVectorizer(ngram_range=ngram, min_df=mindf)
    text_features = vectorizer.fit_transform(df['text']).toarray()

    rating_keys = list(cred_vectors[columns[0]])
    n_rows = len(df)
    flags = np.zeros((n_rows, len(columns)))
    credibility = np.zeros((n_rows, len(rating_keys)))

    # Columns are processed in order, so if a row had several flags set the
    # credibility vector of the last one wins.
    for position, column in enumerate(columns):
        is_type = (df[column] == 1).to_numpy()
        flags[is_type, position] = 1
        credibility[is_type] = [cred_vectors[column][key] for key in rating_keys]

    return np.hstack([text_features, flags, credibility])

In [268]:
# TF-IDF settings: passed to TfidfVectorizer as `ngram_range` and `min_df`.
ngram = (1,3)
mindf = 2

In [269]:
def classify(df,cred_vectors,columns,key):
    """Cross-validate a LinearSVC on the features produced by ``vectorize``.

    Runs a shuffled, stratified 10-fold cross-validation (the shuffle is not
    seeded, so repeated calls use different splits), prints the mean and the
    standard deviation of the fold accuracies, and returns the mean.

    NOTE(review): the features are computed from the whole frame before the
    folds are split, so the TF-IDF fit (and any credibility vectors derived
    from all ratings) have seen the held-out rows — confirm this is intended.

    Args:
        df: DataFrame with the 'rating' column and the inputs of ``vectorize``.
        cred_vectors: credibility vectors forwarded to ``vectorize``.
        columns: author-type column names forwarded to ``vectorize``.
        key: label used as the prefix of the printed line.

    Returns:
        The mean accuracy over the 10 folds.
    """
    features = vectorize(df, cred_vectors, columns)
    labels = df["rating"].astype('int')

    model = LinearSVC(random_state=0, tol=1e-5)
    folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=None)
    fold_accuracy = cross_val_score(model, features, labels, cv=folds, scoring='accuracy')

    run_label = '%s with authors' % key
    mean_accuracy = fold_accuracy.mean()
    print('%s - Mean accuracy: %f, Deviation: %f' % (run_label, mean_accuracy, fold_accuracy.std()))
    return mean_accuracy

In [270]:
iterations = 10

# Repeat the cross-validation and average the mean accuracies of all runs
scores = [
    classify(df, norm_cred_vectors, columns, '3 Classes')
    for _ in range(iterations)
]

avg = sum(scores) / len(scores)
print("Average of %d runs: %f\n" % (iterations, avg))

3 Classes with authors - Mean accuracy: 0.554184, Deviation: 0.044829
3 Classes with authors - Mean accuracy: 0.538721, Deviation: 0.035836
3 Classes with authors - Mean accuracy: 0.556608, Deviation: 0.065547
3 Classes with authors - Mean accuracy: 0.543598, Deviation: 0.037549
3 Classes with authors - Mean accuracy: 0.545848, Deviation: 0.016044
3 Classes with authors - Mean accuracy: 0.560295, Deviation: 0.036710
3 Classes with authors - Mean accuracy: 0.540970, Deviation: 0.045749
3 Classes with authors - Mean accuracy: 0.537345, Deviation: 0.042215
3 Classes with authors - Mean accuracy: 0.555448, Deviation: 0.048902
3 Classes with authors - Mean accuracy: 0.534987, Deviation: 0.036606
Average of 10 runs: 0.546800



### Credibility vectors only (author features, without the TF-IDF text features)

In [271]:
def vectorize2(df, cred_vectors, columns):
    """Build the author-only feature matrix (no text features).

    The result has one row per row of ``df`` and, from left to right:

    * one 0/1 flag per entry of ``columns``, copied from ``df``;
    * the credibility vector of the row's author type, one value per rating
      key of ``cred_vectors[columns[0]]`` in sorted order (0, 1, 2 for the
      three-class setup); rows with no flag set keep zeros.

    Only these features are computed: no TF-IDF matrix is fitted, because
    none of its columns is part of the result. The rating keys are read from
    ``cred_vectors`` instead of being fixed to 0, 1 and 2, and rows are
    addressed by position, so ``df`` does not need a 0..n-1 index.

    Args:
        df: DataFrame containing the 0/1 columns named in ``columns``.
        cred_vectors: mapping column name -> {rating: value}; every inner
            mapping is expected to share the keys of the first one.
        columns: names of the author-type flag columns (not modified).

    Returns:
        A 2-D float ``numpy.ndarray`` of shape
        ``(len(df), len(columns) + number of ratings)``.
    """
    rating_keys = sorted(cred_vectors[columns[0]])
    n_rows = len(df)
    flags = np.zeros((n_rows, len(columns)))
    credibility = np.zeros((n_rows, len(rating_keys)))

    # Columns are processed in order, so if a row had several flags set the
    # credibility vector of the last one wins.
    for position, column in enumerate(columns):
        is_type = (df[column] == 1).to_numpy()
        flags[is_type, position] = 1
        credibility[is_type] = [cred_vectors[column][key] for key in rating_keys]

    return np.hstack([flags, credibility])

In [272]:
def classify2(df,cred_vectors,columns,key):
    """Cross-validate a LinearSVC on the features produced by ``vectorize2``.

    Runs a shuffled, stratified 10-fold cross-validation (the shuffle is not
    seeded, so repeated calls use different splits), prints the mean and the
    standard deviation of the fold accuracies, and returns the mean.

    NOTE(review): the features are computed from the whole frame before the
    folds are split, so credibility vectors derived from all ratings have
    seen the held-out rows — confirm this is intended.

    Args:
        df: DataFrame with the 'rating' column and the inputs of ``vectorize2``.
        cred_vectors: credibility vectors forwarded to ``vectorize2``.
        columns: author-type column names forwarded to ``vectorize2``.
        key: label used as the prefix of the printed line.

    Returns:
        The mean accuracy over the 10 folds.
    """
    features = vectorize2(df, cred_vectors, columns)
    labels = df["rating"].astype('int')

    model = LinearSVC(random_state=0, tol=1e-5)
    folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=None)
    fold_accuracy = cross_val_score(model, features, labels, cv=folds, scoring='accuracy')

    run_label = '%s with authors' % key
    mean_accuracy = fold_accuracy.mean()
    print('%s - Mean accuracy: %f, Deviation: %f' % (run_label, mean_accuracy, fold_accuracy.std()))
    return mean_accuracy

In [273]:
iterations = 10

# Repeat the cross-validation and average the mean accuracies of all runs
scores = [
    classify2(df, norm_cred_vectors, columns, '3 Classes')
    for _ in range(iterations)
]

avg = sum(scores) / len(scores)
print("Average of %d runs: %f\n" % (iterations, avg))

3 Classes with authors - Mean accuracy: 0.583248, Deviation: 0.011876
3 Classes with authors - Mean accuracy: 0.583190, Deviation: 0.010137
3 Classes with authors - Mean accuracy: 0.578284, Deviation: 0.011880
3 Classes with authors - Mean accuracy: 0.583234, Deviation: 0.012309
3 Classes with authors - Mean accuracy: 0.583248, Deviation: 0.013015
3 Classes with authors - Mean accuracy: 0.583190, Deviation: 0.006579
3 Classes with authors - Mean accuracy: 0.583234, Deviation: 0.012309
3 Classes with authors - Mean accuracy: 0.583219, Deviation: 0.011569
3 Classes with authors - Mean accuracy: 0.583205, Deviation: 0.010925
3 Classes with authors - Mean accuracy: 0.583176, Deviation: 0.009130
Average of 10 runs: 0.582723

