In [1]:
# Load contractions model
from pycontractions import Contractions

cont = Contractions(api_key="glove-twitter-100")
cont.load_models()

In [2]:
import pandas as pd
import numpy as np
import csv
import re
import math

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.svm import LinearSVC
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

### Load Data

In [3]:
file = 'Data/c4.csv'
df = pd.read_csv(file,sep=",")
df = df.loc[df['rating'] != 'OTHER']

In [4]:
authors_dict = dict()

with open('Data/authors_classified.csv', 'r', newline='') as file:
    reader = csv.reader(file)
    next(reader)  # Skip header row.
    for row in reader:
        authors_dict[row[0]] = row[1]

In [5]:
df.shape
#990 with snopes

(6179, 7)

In [6]:
df = df.loc[df['organization'] != 'snopes']
df.shape
#949 without snopes

(6053, 7)

In [7]:
ix=[i for i in df.index if ((df.at[i,'author'] in authors_dict.keys()))]
df = df.loc[ix]
df.shape
#827 claims with classified authors

(5330, 7)

In [8]:
df = df.reset_index(drop=True)

### Add author type to df

In [9]:
def add_type_column(df):
    df['type'] = ''
    for i in df.index:
        df.at[i,'type'] = authors_dict[df.at[i,'author']]

In [10]:
add_type_column(df)

### Preprocess DF

In [11]:
def preprocess_claim(sentence):
    sentence = sentence.lower().strip()
    translator = str.maketrans('’', "'", '')
    sentence = sentence.translate(translator)
    sentence = re.sub("u\.s\.","united states",sentence)
    sentence = list(cont.expand_texts([sentence],precise=True))[0]
    sentence = re.sub("[^a-zA-Z0-9_.’,]|(?<!\d)\.(?!\d)|(?<!\w)-(?!\w)|(?<!\d)\,(?!\d)",' ',sentence)
    sentence = re.sub(",",'',sentence)
    sentence = re.sub("\.",'',sentence)
    sentence = re.sub(" a ",' ',sentence)
    sentence = re.sub('\s+', ' ', sentence).strip()
    sentence = re.sub(" s ",' ',sentence)
    if sentence[0:5] == 'says ':
        sentence = sentence[5:]
    sentence = ' '.join([w for w in sentence.split() if len(w)>1])
    return sentence

def preprocess_df(df):
    for index in df.index:
        df.at[index,'text'] = preprocess_claim(df.at[index,'text'])
        rating = df.at[index,'rating']
        if rating == 'FALSE':
            df.at[index,'rating'] = 0
        if rating == 'TRUE':
            df.at[index,'rating'] = 1
        if rating == 'MIXTURE':
            df.at[index,'rating'] = 2
    return df

In [12]:
df = preprocess_df(df)

### DF Copies

In [13]:
#All
df1 = df.copy()

#True-False
df2 = df1.copy()
df2 = df2.loc[(df2['rating'] == 0) | (df2['rating'] == 1)]
df2 = df2.reset_index(drop=True)

#Mixed
df3 = df1.copy()
for index in df3.index:
    rating = df3.at[index,'rating']
    if rating == 1:
        df3.at[index,'rating'] = 0
    if rating == 2:
        df3.at[index,'rating'] = 1

In [14]:
dfs = dict()
dfs['3 Classes'] = df1
dfs['2 Classes'] = df2
dfs['2 Mixed Classes'] = df3

### Classify without authors

In [15]:
ngram = (1,3)
mindf = 2

def vectorize(df):
    vectorizer = TfidfVectorizer(ngram_range=ngram,min_df=mindf)
    vectors = vectorizer.fit_transform(df['text']).toarray()
    return vectors

In [16]:
def classify(df, key):
    X = vectorize(df)
    y = df["rating"]
    y = y.astype('int')
    clf = LinearSVC(random_state=0, tol=1e-5)
    k_fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=None)
    score = cross_val_score(clf, X, y, cv=k_fold, scoring='accuracy')
    print('%s - Mean accuracy: %f, Deviation: %f' % (key,score.mean(),score.std()))
    return score.mean()

In [17]:
#Classify without authors
iterations = 10

for key, df in dfs.items():
    scores = list()
    for i in range(iterations):
        scores.append(classify(df,key))

    avg = sum(scores)/len(scores)
    print("Average of %d runs: %f\n" % (iterations,avg))

3 Classes - Mean accuracy: 0.561541, Deviation: 0.010503
3 Classes - Mean accuracy: 0.565667, Deviation: 0.010233
3 Classes - Mean accuracy: 0.558533, Deviation: 0.017906
3 Classes - Mean accuracy: 0.557783, Deviation: 0.010593
3 Classes - Mean accuracy: 0.564722, Deviation: 0.013668
3 Classes - Mean accuracy: 0.566411, Deviation: 0.009053
3 Classes - Mean accuracy: 0.558539, Deviation: 0.010852
3 Classes - Mean accuracy: 0.556288, Deviation: 0.011443
3 Classes - Mean accuracy: 0.557604, Deviation: 0.015704
3 Classes - Mean accuracy: 0.562853, Deviation: 0.011597
Average of 10 runs: 0.560994

2 Classes - Mean accuracy: 0.638865, Deviation: 0.027311
2 Classes - Mean accuracy: 0.637437, Deviation: 0.018688
2 Classes - Mean accuracy: 0.642677, Deviation: 0.031628
2 Classes - Mean accuracy: 0.639829, Deviation: 0.030270
2 Classes - Mean accuracy: 0.640287, Deviation: 0.025044
2 Classes - Mean accuracy: 0.643183, Deviation: 0.032063
2 Classes - Mean accuracy: 0.646965, Deviation: 0.029659
2

### Classify with authors

In [18]:
def vectorize2(df):
    vectorizer = TfidfVectorizer(ngram_range=ngram,min_df=mindf)
    X = vectorizer.fit_transform(df['text']).toarray()
    df_vectors = pd.DataFrame(X, columns = vectorizer.get_feature_names())
    df_vectors['person'] = 0
    df_vectors['democrat'] = 0
    df_vectors['republican'] = 0
    df_vectors['political'] = 0
    df_vectors['journalist'] = 0
    df_vectors['organization'] = 0
    
    for i in df.index:
        if df.at[i,'type'] == 'Person':
            df_vectors.at[i,'person'] = 1
        if df.at[i,'type'] == 'Democrat':
            df_vectors.at[i,'person'] = 1
            df_vectors.at[i,'democrat'] = 1
        if df.at[i,'type'] == 'Republican':
            df_vectors.at[i,'person'] = 1
            df_vectors.at[i,'republican'] = 1
        if df.at[i,'type'] == 'Political':
            df_vectors.at[i,'person'] = 1
            df_vectors.at[i,'political'] = 1
        if df.at[i,'type'] == 'Journalist':
            df_vectors.at[i,'person'] = 1
            df_vectors.at[i,'journalist'] = 1
        if df.at[i,'type'] == 'Organization':
            df_vectors.at[i,'organization'] = 1
    return df_vectors.to_numpy()

In [19]:
def classify2(df, key):
    X = vectorize2(df)
    y = df["rating"]
    y = y.astype('int')
    clf = LinearSVC(random_state=0, tol=1e-5)
    k_fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=None)
    score = cross_val_score(clf, X, y, cv=k_fold, scoring='accuracy')
    print('%s - Mean accuracy: %f, Deviation: %f' % (('%s with authors'%key),score.mean(),score.std()))
    return score.mean()

In [20]:
#Classify with authors
iterations = 10

for key, df in dfs.items():
    scores = list()
    for i in range(iterations):
        scores.append(classify2(df,key))

    avg = sum(scores)/len(scores)
    print("Average of %d runs: %f\n" % (iterations,avg))

3 Classes with authors - Mean accuracy: 0.559849, Deviation: 0.011154
3 Classes with authors - Mean accuracy: 0.570345, Deviation: 0.020037
3 Classes with authors - Mean accuracy: 0.569419, Deviation: 0.010230
3 Classes with authors - Mean accuracy: 0.565483, Deviation: 0.009810
3 Classes with authors - Mean accuracy: 0.569607, Deviation: 0.017298
3 Classes with authors - Mean accuracy: 0.559670, Deviation: 0.022957
3 Classes with authors - Mean accuracy: 0.563038, Deviation: 0.020898
3 Classes with authors - Mean accuracy: 0.561169, Deviation: 0.017635
3 Classes with authors - Mean accuracy: 0.571291, Deviation: 0.008420
3 Classes with authors - Mean accuracy: 0.565484, Deviation: 0.011276
Average of 10 runs: 0.565536

2 Classes with authors - Mean accuracy: 0.657927, Deviation: 0.027818
2 Classes with authors - Mean accuracy: 0.643167, Deviation: 0.032774
2 Classes with authors - Mean accuracy: 0.648863, Deviation: 0.024182
2 Classes with authors - Mean accuracy: 0.653627, Deviation:

### Author IDF

In [21]:
#Author IDF test
def get_iaf_dict(ngram,mindf,df):
    vectorizer = CountVectorizer(ngram_range=ngram,min_df=mindf)
    vectorizer.fit(df['text'])
    icfdict = {key:0 for key in vectorizer.get_feature_names()}
    classes = df["rating"].unique()

    dfcldict = {key:df.loc[df["rating"] == key] for key in classes}

    for key in icfdict.keys():
        for cl in classes:
            dfcl = dfcldict[cl]
            for claim in dfcl["text"]:
                text = ' ' + claim + ' '
                if text.find(' ' + key + ' ') >= 0:
                    icfdict[key] = icfdict[key] + 1
                    break
    return icfdict 

In [22]:
def classify_iaf(df, key1):
    vectorizer = TfidfVectorizer(ngram_range=ngram,min_df=mindf)
    iafdict = get_iaf_dict(ngram,mindf,df)
    vec = vectorizer.fit_transform(df['text']).toarray()
    
    for key,value in vectorizer.vocabulary_.items():
        vec[:,value] = vec[:,value] * math.log10(1 + iafdict[key])
    
    X = vec
    y = df["rating"]
    y = y.astype('int')
    clf = LinearSVC(random_state=0, tol=1e-5)
    k_fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=None)
    score = cross_val_score(clf, X, y, cv=k_fold, scoring='accuracy')
    print('%s - Mean accuracy: %f, Deviation: %f' % (key1,score.mean(),score.std()))
    return score.mean()

In [23]:
# Classify with IAF
iterations = 10

for key, df in dfs.items():
    scores = list()
    for i in range(iterations):
        scores.append(classify_iaf(df,key))
    avg = sum(scores)/len(scores)
    print("Average of %d runs: %f\n" % (iterations,avg))

3 Classes - Mean accuracy: 0.584241, Deviation: 0.009446
3 Classes - Mean accuracy: 0.583865, Deviation: 0.006925
3 Classes - Mean accuracy: 0.586677, Deviation: 0.014332
3 Classes - Mean accuracy: 0.585371, Deviation: 0.007343
3 Classes - Mean accuracy: 0.583304, Deviation: 0.007649
3 Classes - Mean accuracy: 0.584803, Deviation: 0.012200
3 Classes - Mean accuracy: 0.583491, Deviation: 0.010827
3 Classes - Mean accuracy: 0.586304, Deviation: 0.012875
3 Classes - Mean accuracy: 0.581234, Deviation: 0.010110
3 Classes - Mean accuracy: 0.585744, Deviation: 0.010253
Average of 10 runs: 0.584504

2 Classes - Mean accuracy: 0.614087, Deviation: 0.035288
2 Classes - Mean accuracy: 0.619337, Deviation: 0.021753
2 Classes - Mean accuracy: 0.607922, Deviation: 0.027423
2 Classes - Mean accuracy: 0.625054, Deviation: 0.029815
2 Classes - Mean accuracy: 0.620285, Deviation: 0.040342
2 Classes - Mean accuracy: 0.619337, Deviation: 0.022772
2 Classes - Mean accuracy: 0.623614, Deviation: 0.028088
2