In [1]:
# Load the contractions model; `cont.expand_texts` is called later by preprocess_claim.
from pycontractions import Contractions

# NOTE(review): "glove-twitter-100" presumably names a pretrained embedding
# model that pycontractions resolves on its own — confirm against its docs.
cont = Contractions(api_key="glove-twitter-100")
# Load the model up front, before any cell uses `cont`.
cont.load_models()

In [2]:
import pandas as pd
import numpy as np
import csv
import re

from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.svm import LinearSVC
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

### Load Data

In [3]:
# Read the claims table and keep only the rows whose rating is not 'OTHER'.
file = 'Data/c3.csv'
df = pd.read_csv(file, sep=",").loc[lambda claims: claims['rating'] != 'OTHER']

In [4]:
# Build the author -> author-type lookup from the first two CSV columns.
with open('Data/authors_classified.csv', 'r', newline='') as file:
    rows = csv.reader(file)
    next(rows)  # The first row is the header, not an author.
    authors_dict = {row[0]: row[1] for row in rows}

In [5]:
df.shape
# Row count while snopes claims are still included.
# NOTE(review): an earlier run recorded 990 here; the saved output shows 1410 rows — confirm which is current.

(1410, 7)

In [6]:
# Remove the claims published by snopes, then display the resulting shape.
# (An earlier run recorded 949 rows at this point.)
not_snopes = df['organization'] != 'snopes'
df = df[not_snopes]
df.shape

(1355, 7)

In [7]:
# Keep only the claims whose author has an entry in authors_dict, then
# display the resulting shape. (An earlier run recorded 827 such claims.)
known_authors = authors_dict.keys()
ix = [label for label, author in df['author'].items() if author in known_authors]
df = df.loc[ix]
df.shape

(1221, 7)

In [8]:
# Renumber the rows 0..n-1 after the filtering above and discard the old labels.
# vectorize2 later uses df.index labels to address rows of a freshly built
# frame, so the labels need to be positional.
df = df.reset_index(drop=True)

### Add author type to df

In [9]:
def add_type_column(df):
    """Attach a 'type' column holding each row's author category.

    The category is looked up in the module-level ``authors_dict`` by the
    row's 'author' value. The frame is modified in place and nothing is
    returned. An author that is missing from ``authors_dict`` raises KeyError.
    """
    df['type'] = ''
    for label, author in df['author'].items():
        df.at[label, 'type'] = authors_dict[author]

In [10]:
# Adds the 'type' column to df in place; the function returns nothing.
add_type_column(df)

### Preprocess DF

In [11]:
def preprocess_claim(sentence):
    """Normalize one claim string into lowercase, space-separated tokens.

    Steps, in order:
      1. lowercase and strip; map the typographic apostrophe to ASCII "'";
      2. replace "u.s." with "united states";
      3. expand contractions with the module-level ``cont`` model;
      4. turn disallowed characters and non-numeric punctuation into spaces,
         then delete the remaining commas and dots;
      5. drop " a " / " s " fragments, collapse whitespace, strip a leading
         "says ", and drop every token of length 1.

    Parameters
    ----------
    sentence : str
        Raw claim text.

    Returns
    -------
    str
        The cleaned claim; may be empty.
    """
    sentence = sentence.lower().strip()
    translator = str.maketrans('’', "'", '')
    sentence = sentence.translate(translator)
    # All patterns are raw strings: sequences such as "\." or "\d" inside a
    # normal string literal are invalid escapes and trigger a warning.
    sentence = re.sub(r"u\.s\.", "united states", sentence)
    sentence = list(cont.expand_texts([sentence], precise=True))[0]
    # Replace with a space: any character outside the allowed class, a dot
    # with no digit on either side, and a comma with no digit on either side.
    # NOTE(review): '-' and "'" are outside the allowed class, so the first
    # alternative already replaces every hyphen and apostrophe; the
    # `(?<!\w)-(?!\w)` alternative and the '’' class member never take effect.
    sentence = re.sub(
        r"[^a-zA-Z0-9_.’,]|(?<!\d)\.(?!\d)|(?<!\w)-(?!\w)|(?<!\d)\,(?!\d)",
        ' ',
        sentence,
    )
    # Commas and dots that touch a digit survived the step above and are
    # deleted without a space here, so "1,000" -> "1000" and "3.5" -> "35".
    sentence = re.sub(r",", '', sentence)
    sentence = re.sub(r"\.", '', sentence)
    sentence = re.sub(r" a ", ' ', sentence)
    sentence = re.sub(r'\s+', ' ', sentence).strip()
    sentence = re.sub(r" s ", ' ', sentence)
    if sentence[0:5] == 'says ':
        sentence = sentence[5:]
    # Single-character tokens (including single digits) are discarded.
    sentence = ' '.join([w for w in sentence.split() if len(w) > 1])
    return sentence

def preprocess_df(df):
    """Clean every claim text and encode the rating labels as integers.

    Each 'text' value is passed through ``preprocess_claim``. Ratings are
    recoded as 'FALSE' -> 0, 'TRUE' -> 1, 'MIXTURE' -> 2; any other rating
    value is left untouched. The frame is modified in place and also returned.
    """
    rating_codes = {'FALSE': 0, 'TRUE': 1, 'MIXTURE': 2}
    for row_label in df.index:
        df.at[row_label, 'text'] = preprocess_claim(df.at[row_label, 'text'])
        label = df.at[row_label, 'rating']
        if label in rating_codes:
            df.at[row_label, 'rating'] = rating_codes[label]
    return df

In [12]:
# preprocess_df edits the frame in place and returns that same frame, so
# re-running this cell preprocesses the already-cleaned text again.
df = preprocess_df(df)

### DF Copies

In [13]:
# All three classes (0 = FALSE, 1 = TRUE, 2 = MIXTURE).
df1 = df.copy()

# True/False only: keep the rows rated 0 or 1 and renumber them from 0.
is_false = df1['rating'] == 0
is_true = df1['rating'] == 1
df2 = df1.loc[is_false | is_true].reset_index(drop=True)

# Mixed vs. not mixed: TRUE (1) is folded into 0 and MIXTURE (2) becomes 1.
df3 = df1.copy()
merged_codes = {1: 0, 2: 1}
for row_label in df3.index:
    code = df3.at[row_label, 'rating']
    if code in merged_codes:
        df3.at[row_label, 'rating'] = merged_codes[code]

In [14]:
# Experiment name -> data frame, in the order the experiments are run.
dfs = {
    '3 Classes': df1,
    '2 Classes': df2,
    '2 Mixed Classes': df3,
}

### Classify without authors

In [15]:
# TF-IDF settings read by both vectorize and vectorize2.
ngram = (1,3)  # passed as ngram_range
mindf = 2      # passed as min_df

def vectorize(df):
    """Return the TF-IDF matrix of ``df['text']`` as a dense array.

    A new TfidfVectorizer (configured from the module-level ``ngram`` and
    ``mindf``) is fitted on the whole 'text' column on every call.
    """
    tfidf = TfidfVectorizer(min_df=mindf, ngram_range=ngram)
    return tfidf.fit_transform(df['text']).toarray()

In [16]:
def classify(df, key):
    """Cross-validate a LinearSVC on the text-only features of ``df``.

    Features come from ``vectorize(df)`` and labels from ``df['rating']`` cast
    to int. Accuracy is measured with a shuffled 10-fold StratifiedKFold whose
    random_state is None, so repeated calls use different splits. One summary
    line labelled with ``key`` is printed.

    Returns the mean accuracy over the folds.

    NOTE(review): vectorize(df) fits TF-IDF on every row before the folds are
    split, so the held-out rows contribute to the vocabulary and IDF weights.
    """
    features = vectorize(df)
    labels = df["rating"].astype('int')
    splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=None)
    fold_scores = cross_val_score(
        LinearSVC(random_state=0, tol=1e-5),
        features,
        labels,
        cv=splitter,
        scoring='accuracy',
    )
    mean_accuracy = fold_scores.mean()
    print('%s - Mean accuracy: %f, Deviation: %f' % (key, mean_accuracy, fold_scores.std()))
    return mean_accuracy

In [17]:
# Classify without authors: repeat the 10-fold evaluation `iterations` times
# per data set (each repetition shuffles the folds differently) and report
# the average of the per-run mean accuracies.
iterations = 10

# The loop variable is deliberately not named `df`: binding `df` here would
# overwrite the notebook-global preprocessed frame with the last entry of
# `dfs`, and re-running an earlier cell would then silently use wrong data.
for key, frame in dfs.items():
    scores = [classify(frame, key) for _ in range(iterations)]

    avg = sum(scores)/len(scores)
    print("Average of %d runs: %f\n" % (iterations,avg))

3 Classes - Mean accuracy: 0.542185, Deviation: 0.032067
3 Classes - Mean accuracy: 0.552002, Deviation: 0.038817
3 Classes - Mean accuracy: 0.551296, Deviation: 0.025852
3 Classes - Mean accuracy: 0.546149, Deviation: 0.038567
3 Classes - Mean accuracy: 0.560253, Deviation: 0.032905
3 Classes - Mean accuracy: 0.563344, Deviation: 0.033419
3 Classes - Mean accuracy: 0.547776, Deviation: 0.047482
3 Classes - Mean accuracy: 0.543744, Deviation: 0.027199
3 Classes - Mean accuracy: 0.546190, Deviation: 0.032115
3 Classes - Mean accuracy: 0.550335, Deviation: 0.028959
Average of 10 runs: 0.550327

2 Classes - Mean accuracy: 0.649456, Deviation: 0.040649
2 Classes - Mean accuracy: 0.649601, Deviation: 0.051296
2 Classes - Mean accuracy: 0.666473, Deviation: 0.079761
2 Classes - Mean accuracy: 0.653374, Deviation: 0.051572
2 Classes - Mean accuracy: 0.659325, Deviation: 0.049521
2 Classes - Mean accuracy: 0.672170, Deviation: 0.047957
2 Classes - Mean accuracy: 0.666691, Deviation: 0.038085
2

### Classify with authors

In [18]:
def vectorize2(df):
    """Return TF-IDF text features with six author-type indicators appended.

    The result is a dense 2-D array: the TF-IDF columns of ``df['text']``
    followed by the 0/1 columns person, democrat, republican, political,
    journalist, organization (in that order), derived from ``df['type']``:

      * 'Person'       -> person
      * 'Democrat'     -> person + democrat
      * 'Republican'   -> person + republican
      * 'Political'    -> person + political
      * 'Journalist'   -> person + journalist
      * 'Organization' -> organization

    Any other type value leaves all six indicators at 0.

    The indicators are built as a separate block and aligned to rows by
    position. Building them as named columns of a vocabulary-labelled frame
    would overwrite the text feature of any vocabulary word that shares a name
    with an indicator (e.g. "republican"), and addressing rows by ``df.index``
    labels would misalign or append rows whenever the index is not 0..n-1.
    This also avoids ``TfidfVectorizer.get_feature_names``, which newer
    scikit-learn releases no longer provide.
    """
    vectorizer = TfidfVectorizer(ngram_range=ngram, min_df=mindf)
    text_features = vectorizer.fit_transform(df['text']).toarray()

    author_columns = ['person', 'democrat', 'republican',
                      'political', 'journalist', 'organization']
    flags_by_type = {
        'Person': ('person',),
        'Democrat': ('person', 'democrat'),
        'Republican': ('person', 'republican'),
        'Political': ('person', 'political'),
        'Journalist': ('person', 'journalist'),
        'Organization': ('organization',),
    }
    column_of = {name: pos for pos, name in enumerate(author_columns)}

    author_features = np.zeros((len(df), len(author_columns)),
                               dtype=text_features.dtype)
    # enumerate() gives the row position, independent of the index labels.
    for row, author_type in enumerate(df['type']):
        for flag in flags_by_type.get(author_type, ()):
            author_features[row, column_of[flag]] = 1

    return np.hstack([text_features, author_features])

In [19]:
def classify2(df, key):
    """Cross-validate a LinearSVC on text plus author-type features of ``df``.

    Features come from ``vectorize2(df)`` and labels from ``df['rating']`` cast
    to int. Accuracy is measured with a shuffled 10-fold StratifiedKFold whose
    random_state is None, so repeated calls use different splits. One summary
    line labelled "<key> with authors" is printed.

    Returns the mean accuracy over the folds.
    """
    features = vectorize2(df)
    labels = df["rating"].astype('int')
    splitter = StratifiedKFold(n_splits=10, shuffle=True, random_state=None)
    fold_scores = cross_val_score(
        LinearSVC(random_state=0, tol=1e-5),
        features,
        labels,
        cv=splitter,
        scoring='accuracy',
    )
    mean_accuracy = fold_scores.mean()
    run_label = '%s with authors' % key
    print('%s - Mean accuracy: %f, Deviation: %f' % (run_label, mean_accuracy, fold_scores.std()))
    return mean_accuracy

In [20]:
# Classify with authors: repeat the 10-fold evaluation `iterations` times per
# data set (each repetition shuffles the folds differently) and report the
# average of the per-run mean accuracies.
iterations = 10

# The loop variable is deliberately not named `df`: binding `df` here would
# overwrite the notebook-global preprocessed frame with the last entry of
# `dfs`, and re-running an earlier cell would then silently use wrong data.
for key, frame in dfs.items():
    scores = [classify2(frame, key) for _ in range(iterations)]

    avg = sum(scores)/len(scores)
    print("Average of %d runs: %f\n" % (iterations,avg))

3 Classes with authors - Mean accuracy: 0.577461, Deviation: 0.034803
3 Classes with authors - Mean accuracy: 0.548750, Deviation: 0.025128
3 Classes with authors - Mean accuracy: 0.554394, Deviation: 0.032575
3 Classes with authors - Mean accuracy: 0.552076, Deviation: 0.041307
3 Classes with authors - Mean accuracy: 0.568416, Deviation: 0.022063
3 Classes with authors - Mean accuracy: 0.556900, Deviation: 0.024237
3 Classes with authors - Mean accuracy: 0.557733, Deviation: 0.040591
3 Classes with authors - Mean accuracy: 0.561818, Deviation: 0.043977
3 Classes with authors - Mean accuracy: 0.551921, Deviation: 0.023388
3 Classes with authors - Mean accuracy: 0.562558, Deviation: 0.040100
Average of 10 runs: 0.559203

2 Classes with authors - Mean accuracy: 0.693469, Deviation: 0.070502
2 Classes with authors - Mean accuracy: 0.668433, Deviation: 0.070273
2 Classes with authors - Mean accuracy: 0.697279, Deviation: 0.060141
2 Classes with authors - Mean accuracy: 0.687591, Deviation: