In [1]:
# Load contractions model
# `cont` is the shared expander that preprocess_claim() calls through
# cont.expand_texts(...), so this cell has to run before any preprocessing.
from pycontractions import Contractions

# NOTE(review): "glove-twitter-100" is presumably the name of a pretrained
# embedding model that pycontractions resolves on load — confirm.
cont = Contractions(api_key="glove-twitter-100")
cont.load_models()

In [24]:
import pandas as pd
import numpy as np
import csv
import re
import math

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

### Load Data

In [46]:
# Read the claims table and discard every row whose rating is 'OTHER'.
file = 'Data/c3.csv'
df = pd.read_csv(file, sep=",")
df = df[df['rating'].ne('OTHER')]

In [47]:
# Lookup table: first CSV column (author) -> second CSV column (author type).
# The header row is skipped; if an author is listed twice the last row wins.
authors_dict = {}

with open('Data/authors_classified.csv', 'r', newline='') as file:
    reader = csv.reader(file)
    next(reader)  # Skip header row.
    authors_dict.update((row[0], row[1]) for row in reader)

In [48]:
df.shape
# Row count with snopes still included. The recorded output of this cell is
# (1410, 7); the older figure of 990 presumably comes from an earlier version
# of the data file.

(1410, 7)

In [49]:
# Drop every row whose 'organization' column is 'snopes'.
df = df.loc[df['organization'] != 'snopes']
df.shape
# Row count without snopes. The recorded output of this cell is (1355, 7);
# the older figure of 949 presumably comes from an earlier version of the data file.

(1355, 7)

In [50]:
# Keep only the claims whose author has an entry in authors_dict.
# `ix` holds the index labels of the surviving rows.
ix = df[df['author'].isin(list(authors_dict))].index.tolist()
df = df.loc[ix]
df.shape
# Number of claims with a classified author (recorded output: (1221, 7)).

(1221, 7)

In [51]:
# Renumber the rows 0..n-1 after the filters above, so the label-based loops
# further down (`for i in df.index`) iterate over consecutive labels.
df = df.reset_index(drop=True)

### Add author type to df

In [52]:
def add_type_column(df):
    """Add a 'type' column holding each row's author type, in place.

    Every value of df['author'] is looked up in the module-level authors_dict;
    an author that is missing from it raises KeyError. Nothing is returned.
    """
    df['type'] = ''
    for label, author in df['author'].items():
        df.at[label, 'type'] = authors_dict[author]

In [53]:
# Mutates df in place (adds the 'type' column); the function returns nothing.
add_type_column(df)

### Preprocess DF

In [54]:
def preprocess_claim(sentence):
    """Normalise one claim into lower-case, space-separated tokens.

    Steps, in the order they are applied:
      1. lower-case and strip the text;
      2. replace the typographic apostrophe with the ASCII one;
      3. rewrite "u.s." as "united states";
      4. expand contractions with the module-level `cont` model;
      5. blank out every character outside [a-zA-Z0-9_.’,], plus any '.' or
         ',' that does not sit between two digits;
      6. delete the remaining ',' and '.' characters;
      7. drop " a " and " s " fragments and collapse runs of whitespace;
      8. remove a leading "says ";
      9. drop every token that is a single character long.

    Args:
        sentence: the raw claim text (a str).

    Returns:
        The cleaned claim as a single string; it may be empty.

    All regular expressions are raw strings. The patterns are unchanged, but
    spelling escapes such as ``\\d`` or ``\\.`` in ordinary string literals is
    deprecated and triggers a warning on current Python versions.
    """
    sentence = sentence.lower().strip()
    translator = str.maketrans('’', "'", '')
    sentence = sentence.translate(translator)
    sentence = re.sub(r"u\.s\.", "united states", sentence)
    sentence = list(cont.expand_texts([sentence], precise=True))[0]
    # Alternatives are tried left to right. '-' is not in the character class,
    # so the first alternative already replaces every hyphen and the dedicated
    # hyphen alternative never gets to decide anything; it is kept so the
    # pattern stays exactly as it was.
    sentence = re.sub(
        r"[^a-zA-Z0-9_.’,]|(?<!\d)\.(?!\d)|(?<!\w)-(?!\w)|(?<!\d)\,(?!\d)",
        ' ',
        sentence,
    )
    sentence = re.sub(r",", '', sentence)
    sentence = re.sub(r"\.", '', sentence)
    # " a " is removed before the whitespace collapse and " s " after it; the
    # order matters for what the "says " prefix check below gets to see.
    sentence = re.sub(r" a ", ' ', sentence)
    sentence = re.sub(r'\s+', ' ', sentence).strip()
    sentence = re.sub(r" s ", ' ', sentence)
    if sentence.startswith('says '):
        sentence = sentence[5:]
    sentence = ' '.join([w for w in sentence.split() if len(w) > 1])
    return sentence

def preprocess_df(df):
    """Clean every claim text and encode the ratings, in place.

    For each row, 'text' is replaced by preprocess_claim(text) and 'rating' is
    mapped 'FALSE' -> 0, 'TRUE' -> 1, 'MIXTURE' -> 2. Any other rating value is
    left as it is. The same (mutated) frame is returned.
    """
    rating_codes = {'FALSE': 0, 'TRUE': 1, 'MIXTURE': 2}
    for label in df.index:
        df.at[label, 'text'] = preprocess_claim(df.at[label, 'text'])
        verdict = df.at[label, 'rating']
        if verdict in rating_codes:
            df.at[label, 'rating'] = rating_codes[verdict]
    return df

In [55]:
# preprocess_df rewrites 'text' and 'rating' in place and returns the same
# frame, so the re-assignment only keeps the name pointing at it.
df = preprocess_df(df)

### DF Copies

In [56]:
#All
# Every claim, ratings encoded as 0 / 1 / 2 by preprocess_df.
df1 = df.copy()

#True-False
# Only the rows rated 0 or 1, renumbered from zero.
df2 = df1[df1['rating'].isin([0, 1])].reset_index(drop=True)

#Mixed
# Two classes again: rating 1 is folded into class 0, rating 2 becomes class 1.
df3 = df1.copy()
merged_codes = {1: 0, 2: 1}
for index in df3.index:
    rating = df3.at[index, 'rating']
    if rating in merged_codes:
        df3.at[index, 'rating'] = merged_codes[rating]

In [57]:
# Experiment label -> frame. The run loops iterate this in insertion order.
dfs = {
    '3 Classes': df1,
    '2 Classes': df2,
    '2 Mixed Classes': df3,
}

### Classify without authors

In [58]:
# Vectorizer settings read as module-level names by the functions below.
ngram = (1,3)  # passed as ngram_range
mindf = 2  # passed as min_df

def vectorize(df):
    """Return the dense TF-IDF matrix of df['text'].

    A new TfidfVectorizer is fitted on the whole column on every call, using
    the module-level ngram and mindf settings.
    """
    tfidf = TfidfVectorizer(min_df=mindf, ngram_range=ngram)
    return tfidf.fit_transform(df['text']).toarray()

In [59]:
def classify(df, key):
    """Cross-validate a linear SVM on TF-IDF text features and print the result.

    df needs 'text' and 'rating' columns; key is only used as the label of the
    printed line. Returns the mean accuracy over 10 stratified folds.

    NOTE(review): the features come from vectorize(df), i.e. the vectorizer is
    fitted on all rows before the folds are split, so held-out claims
    contribute to the vocabulary and IDF weights. The folds are shuffled
    without a seed, so repeated calls give different scores.
    """
    features = vectorize(df)
    labels = df["rating"].astype('int')
    svm = LinearSVC(random_state=0, tol=1e-5)
    folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=None)
    fold_scores = cross_val_score(svm, features, labels, cv=folds, scoring='accuracy')
    mean_accuracy = fold_scores.mean()
    print('%s - Mean accuracy: %f, Deviation: %f' % (key, mean_accuracy, fold_scores.std()))
    return mean_accuracy

In [60]:
#Classify without authors
# classify() reshuffles its folds on every call, so each experiment is run
# several times and the per-run means are averaged.
# NOTE: the loop variable rebinds the notebook-level name `df`.
iterations = 10

for key, df in dfs.items():
    scores = [classify(df, key) for i in range(iterations)]
    avg = sum(scores) / len(scores)
    print("Average of %d runs: %f\n" % (iterations, avg))

3 Classes - Mean accuracy: 0.552110, Deviation: 0.039724
3 Classes - Mean accuracy: 0.545464, Deviation: 0.036910
3 Classes - Mean accuracy: 0.550194, Deviation: 0.028658
3 Classes - Mean accuracy: 0.544852, Deviation: 0.038287
3 Classes - Mean accuracy: 0.544080, Deviation: 0.045615
3 Classes - Mean accuracy: 0.540526, Deviation: 0.039730
3 Classes - Mean accuracy: 0.557001, Deviation: 0.041782
3 Classes - Mean accuracy: 0.541446, Deviation: 0.033573
3 Classes - Mean accuracy: 0.533182, Deviation: 0.032339
3 Classes - Mean accuracy: 0.550571, Deviation: 0.036797
Average of 10 runs: 0.545943

2 Classes - Mean accuracy: 0.671988, Deviation: 0.056670
2 Classes - Mean accuracy: 0.651270, Deviation: 0.031353
2 Classes - Mean accuracy: 0.664332, Deviation: 0.061324
2 Classes - Mean accuracy: 0.677939, Deviation: 0.047989
2 Classes - Mean accuracy: 0.668868, Deviation: 0.042885
2 Classes - Mean accuracy: 0.670682, Deviation: 0.048175
2 Classes - Mean accuracy: 0.661139, Deviation: 0.049367
2

### Classify with authors

In [61]:
def vectorize2(df):
    """Return TF-IDF text features followed by six author-type indicator columns.

    The result is a dense 2-D float array with one row per row of df. The
    leading columns are the TF-IDF weights of df['text'] (a vectorizer fitted
    on the whole column with the module-level ngram and mindf). The last six
    columns are 0/1 flags derived from df['type'], in this order:
    person, democrat, republican, political, journalist, organization.
    'Democrat', 'Republican', 'Political' and 'Journalist' also set the person
    flag; a type that is not in the table leaves all six flags at zero.

    The flags live in their own array and are appended to the text features
    rather than being stored under their names next to the vocabulary. In a
    name-indexed frame a vocabulary term such as "republican" or "person" has
    the same column label as the flag, so assigning the flag overwrites that
    term's TF-IDF column instead of adding a new one. Rows are addressed by
    position, so df does not need a 0..n-1 index, and no feature-name lookup
    on the vectorizer is required.
    """
    flag_names = ('person', 'democrat', 'republican',
                  'political', 'journalist', 'organization')
    flags_by_type = {
        'Person': ('person',),
        'Democrat': ('person', 'democrat'),
        'Republican': ('person', 'republican'),
        'Political': ('person', 'political'),
        'Journalist': ('person', 'journalist'),
        'Organization': ('organization',),
    }
    flag_position = {name: pos for pos, name in enumerate(flag_names)}

    vectorizer = TfidfVectorizer(ngram_range=ngram, min_df=mindf)
    text_features = vectorizer.fit_transform(df['text']).toarray()

    author_flags = np.zeros((len(df), len(flag_names)), dtype=text_features.dtype)
    for row, author_type in enumerate(df['type']):
        for name in flags_by_type.get(author_type, ()):
            author_flags[row, flag_position[name]] = 1

    return np.hstack([text_features, author_flags])

In [62]:
def classify2(df, key):
    """Cross-validate a linear SVM on text plus author-type features.

    Same procedure as the text-only run, but the feature matrix comes from
    vectorize2(df). key is used only in the printed label, which gets the
    suffix ' with authors'. Returns the mean accuracy over 10 stratified folds.

    NOTE(review): the features are built from all rows before the folds are
    split, and the folds are shuffled without a seed, so repeated calls give
    different scores.
    """
    features = vectorize2(df)
    labels = df["rating"].astype('int')
    svm = LinearSVC(random_state=0, tol=1e-5)
    folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=None)
    fold_scores = cross_val_score(svm, features, labels, cv=folds, scoring='accuracy')
    mean_accuracy = fold_scores.mean()
    label = '%s with authors' % key
    print('%s - Mean accuracy: %f, Deviation: %f' % (label, mean_accuracy, fold_scores.std()))
    return mean_accuracy

In [63]:
#Classify with authors
# classify2() reshuffles its folds on every call, so each experiment is run
# several times and the per-run means are averaged.
# NOTE: the loop variable rebinds the notebook-level name `df`.
iterations = 10

for key, df in dfs.items():
    scores = [classify2(df, key) for i in range(iterations)]
    avg = sum(scores) / len(scores)
    print("Average of %d runs: %f\n" % (iterations, avg))

3 Classes with authors - Mean accuracy: 0.561039, Deviation: 0.039062
3 Classes with authors - Mean accuracy: 0.559507, Deviation: 0.029582
3 Classes with authors - Mean accuracy: 0.552909, Deviation: 0.018268
3 Classes with authors - Mean accuracy: 0.560979, Deviation: 0.046072
3 Classes with authors - Mean accuracy: 0.559171, Deviation: 0.043304
3 Classes with authors - Mean accuracy: 0.558661, Deviation: 0.027131
3 Classes with authors - Mean accuracy: 0.566690, Deviation: 0.021002
3 Classes with authors - Mean accuracy: 0.556134, Deviation: 0.024517
3 Classes with authors - Mean accuracy: 0.552929, Deviation: 0.030491
3 Classes with authors - Mean accuracy: 0.557008, Deviation: 0.035970
Average of 10 runs: 0.558503

2 Classes with authors - Mean accuracy: 0.677939, Deviation: 0.071413
2 Classes with authors - Mean accuracy: 0.691909, Deviation: 0.062814
2 Classes with authors - Mean accuracy: 0.687990, Deviation: 0.057731
2 Classes with authors - Mean accuracy: 0.682039, Deviation:

### Author IDF

In [64]:
#Author IDF test
def get_iaf_dict(ngram,mindf,df):
    """Count, for every vocabulary term, the rating classes in which it occurs.

    Args:
        ngram: n-gram range handed to CountVectorizer as ngram_range.
        mindf: minimum document frequency handed to CountVectorizer as min_df.
        df: frame with a 'text' column (space-separated tokens) and a 'rating'
            column.

    Returns:
        dict mapping each vocabulary term to the number of distinct values of
        df['rating'] for which at least one claim contains the term as a run
        of whole, space-delimited tokens. Keys are in feature-index order.

    Note: despite the "author IDF" label, the count is taken over the values
    of df['rating'], not over authors.
    """
    vectorizer = CountVectorizer(ngram_range=ngram,min_df=mindf)
    vectorizer.fit(df['text'])
    # vocabulary_ maps term -> column index. Ordering the terms by that index
    # gives the feature-name list without calling get_feature_names(), which
    # scikit-learn removed in version 1.2.
    vocabulary = vectorizer.vocabulary_
    terms = sorted(vocabulary, key=vocabulary.get)

    # Pad every claim with spaces once per class (not once per term), so that
    # a padded term can only match on token boundaries.
    padded_by_class = [
        [' ' + claim + ' ' for claim in df.loc[df["rating"] == cl, "text"]]
        for cl in df["rating"].unique()
    ]

    icfdict = {}
    for term in terms:
        needle = ' ' + term + ' '
        # any() stops at the first matching claim of a class.
        icfdict[term] = sum(
            any(needle in text for text in class_texts)
            for class_texts in padded_by_class
        )
    return icfdict

In [65]:
def classify_iaf(df, key1):
    """Cross-validate a linear SVM on class-count-weighted TF-IDF features.

    Each TF-IDF column is multiplied by log10(1 + c), where c is the value
    get_iaf_dict() reports for that column's term. key1 is used only as the
    label of the printed line. Returns the mean accuracy over 10 stratified
    folds.

    NOTE(review): the weights are computed from the ratings of all rows of df
    before the folds are split, so held-out labels influence the features.
    The weight also grows with c, i.e. terms seen in more classes are scaled
    up — confirm that this is the intended direction.
    """
    vectorizer = TfidfVectorizer(ngram_range=ngram, min_df=mindf)
    iafdict = get_iaf_dict(ngram, mindf, df)
    tfidf = vectorizer.fit_transform(df['text']).toarray()

    # One weight per column, then a single broadcast multiply over all rows.
    weights = np.empty(tfidf.shape[1])
    for term, column in vectorizer.vocabulary_.items():
        weights[column] = math.log10(1 + iafdict[term])
    features = tfidf * weights

    labels = df["rating"].astype('int')
    svm = LinearSVC(random_state=0, tol=1e-5)
    folds = StratifiedKFold(n_splits=10, shuffle=True, random_state=None)
    fold_scores = cross_val_score(svm, features, labels, cv=folds, scoring='accuracy')
    mean_accuracy = fold_scores.mean()
    print('%s - Mean accuracy: %f, Deviation: %f' % (key1, mean_accuracy, fold_scores.std()))
    return mean_accuracy

In [66]:
# Classify with IAF
# classify_iaf() reshuffles its folds on every call, so each experiment is
# run several times and the per-run means are averaged.
# NOTE: the loop variable rebinds the notebook-level name `df`.
iterations = 10

for key, df in dfs.items():
    scores = [classify_iaf(df, key) for i in range(iterations)]
    avg = sum(scores) / len(scores)
    print("Average of %d runs: %f\n" % (iterations, avg))

3 Classes - Mean accuracy: 0.546291, Deviation: 0.017980
3 Classes - Mean accuracy: 0.544504, Deviation: 0.022676
3 Classes - Mean accuracy: 0.540559, Deviation: 0.019115
3 Classes - Mean accuracy: 0.545390, Deviation: 0.025491
3 Classes - Mean accuracy: 0.542111, Deviation: 0.023913
3 Classes - Mean accuracy: 0.552869, Deviation: 0.015710
3 Classes - Mean accuracy: 0.540573, Deviation: 0.026896
3 Classes - Mean accuracy: 0.546217, Deviation: 0.024594
3 Classes - Mean accuracy: 0.545370, Deviation: 0.022037
3 Classes - Mean accuracy: 0.555341, Deviation: 0.016834
Average of 10 runs: 0.545923

2 Classes - Mean accuracy: 0.630443, Deviation: 0.027360
2 Classes - Mean accuracy: 0.619158, Deviation: 0.031507
2 Classes - Mean accuracy: 0.620936, Deviation: 0.037442
2 Classes - Mean accuracy: 0.638099, Deviation: 0.016486
2 Classes - Mean accuracy: 0.626742, Deviation: 0.030022
2 Classes - Mean accuracy: 0.626451, Deviation: 0.035316
2 Classes - Mean accuracy: 0.622714, Deviation: 0.028425
2