In [60]:
# Load contractions model
# Builds the pycontractions expander once and loads its models; the resulting
# module-level `cont` object is what preprocess_claim() calls to expand contractions.
from pycontractions import Contractions

# NOTE(review): "glove-twitter-100" looks like a pretrained embedding name rather
# than a secret, despite the `api_key` parameter name — confirm.
cont = Contractions(api_key="glove-twitter-100")
cont.load_models()

In [61]:
import pandas as pd
import numpy as np
import csv
import re
import math

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import mutual_info_classif
from sklearn import metrics

### Load Data

In [96]:
# Pick the cluster to analyse; its claims are read from Data/<cluster_name>.csv.
cluster_name = 'c3'
file = ''.join(['Data/', cluster_name, '.csv'])

# Read the claims and keep every row whose rating is not 'OTHER'.
df = pd.read_csv(file, sep=",")
df = df[df['rating'] != 'OTHER']

### Preprocess

In [97]:
def preprocess_claim(sentence):
    """Normalise a single claim string before n-gram extraction.

    Steps, in order:
      1. lowercase and strip the text, and turn the typographic apostrophe
         into a plain "'";
      2. spell out "u.s." as "united states";
      3. expand contractions with the module-level ``cont`` model;
      4. replace with a space every character outside ``[a-zA-Z0-9_.’,]``, and
         every '.' or ',' that has no digit immediately before or after it;
      5. delete the remaining commas and dots;
      6. remove " a " / " s " occurrences and collapse runs of whitespace;
      7. drop a leading "says ";
      8. drop every one-character token.

    Parameters
    ----------
    sentence : str
        Raw claim text.

    Returns
    -------
    str
        The cleaned claim, tokens separated by single spaces.
    """
    sentence = sentence.lower().strip()
    translator = str.maketrans('’', "'", '')
    sentence = sentence.translate(translator)
    # Regex literals are raw strings: escapes such as "\." or "\d" in a normal
    # string literal are invalid escape sequences and warn on recent Pythons.
    sentence = re.sub(r"u\.s\.", "united states", sentence)
    sentence = list(cont.expand_texts([sentence], precise=True))[0]
    sentence = re.sub(
        r"[^a-zA-Z0-9_.’,]|(?<!\d)\.(?!\d)|(?<!\w)-(?!\w)|(?<!\d)\,(?!\d)",
        ' ',
        sentence,
    )
    # Dots and commas that survived the previous step sit next to a digit;
    # they are deleted outright, not replaced by a space.
    sentence = re.sub(",", '', sentence)
    sentence = re.sub(r"\.", '', sentence)
    sentence = re.sub(" a ", ' ', sentence)
    sentence = re.sub(r'\s+', ' ', sentence).strip()
    sentence = re.sub(" s ", ' ', sentence)
    if sentence[0:5] == 'says ':
        sentence = sentence[5:]
    sentence = ' '.join([w for w in sentence.split() if len(w) > 1])
    return sentence

def preprocess_df(df):
    """Clean the claims and encode the ratings of ``df`` in place.

    Every value of the 'text' column is passed through preprocess_claim().
    In the 'rating' column 'FALSE' becomes 0, 'TRUE' becomes 1 and 'MIXTURE'
    becomes 2; any other value is left untouched.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with 'text' and 'rating' columns; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame object that was passed in.
    """
    rating_codes = {'FALSE': 0, 'TRUE': 1, 'MIXTURE': 2}
    for row_label in df.index:
        raw_text = df.at[row_label, 'text']
        df.at[row_label, 'text'] = preprocess_claim(raw_text)
        label = df.at[row_label, 'rating']
        if label in rating_codes:
            df.at[row_label, 'rating'] = rating_codes[label]
    return df

In [98]:
# Clean the 'text' column and encode 'rating' as 0/1/2 (see preprocess_df).
df = preprocess_df(df)

### Mutual Information HMIN232M Formula

In [65]:
def get_mi_dict(ngram,mindf,c,df):
    """Rank n-gram features by their mutual information with class ``c``.

    For each feature ``t`` extracted by a CountVectorizer the score is

        log10(A * N / ((A + C) * (A + B)))

    where, counting claims, A = contains t and is rated c, B = contains t and
    is not rated c, C = does not contain t and is rated c, and N = number of
    rows in ``df``. Features with A == 0 get a score of 0. A claim "contains"
    a feature when the space-padded feature occurs as a substring of the
    space-padded claim text.

    Parameters
    ----------
    ngram : tuple
        Forwarded to CountVectorizer as ``ngram_range``.
    mindf : int or float
        Forwarded to CountVectorizer as ``min_df``.
    c : object
        Class value compared (with ``==``) against ``df['rating']``.
    df : pandas.DataFrame
        Frame with 'text' and 'rating' columns.

    Returns
    -------
    list of (str, number)
        ``(feature, score)`` pairs sorted by descending score.
    """
    #Get features
    vectorizer = CountVectorizer(ngram_range=ngram,min_df=mindf)
    vectorizer.fit(df['text'])
    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when present and fall back to the old name on older releases.
    if hasattr(vectorizer, 'get_feature_names_out'):
        features = list(vectorizer.get_feature_names_out())
    else:
        features = vectorizer.get_feature_names()

    # These do not depend on the feature, so compute them once rather than
    # once per (feature, claim) pair.
    padded_claims = [' ' + claim + ' ' for claim in df['text']]
    in_class = [bool(rating == c) for rating in df['rating']]
    n_claims = df.shape[0]
    n_in_class = sum(in_class)

    #Get MI dict
    MI_dict = dict()
    for key in features:
        needle = ' ' + key + ' '
        A = 0  # claims containing the feature and rated c
        B = 0  # claims containing the feature and not rated c
        for claim, rated_c in zip(padded_claims, in_class):
            if needle in claim:
                if rated_c:
                    A += 1
                else:
                    B += 1
        # Claims rated c that lack the feature are the rest of the class.
        C = n_in_class - A
        if A > 0:
            MI_dict[key] = math.log10((A * n_claims) / ((A + C) * (A + B)))
        else:
            MI_dict[key] = 0

    #Sort and return MI dict
    sorted_dict = sorted(MI_dict.items(), key=lambda kv: kv[1], reverse=True)
    return sorted_dict

### Mutual Information scikit-learn Implementation

In [66]:
def get_mi_dict_sklearn(ngram,mindf,df,tfidf=False):
    """Rank n-gram features with ``sklearn.metrics.mutual_info_score``.

    The claims in ``df['text']`` are vectorized (term counts by default,
    TF-IDF weights when ``tfidf`` is set) and each feature column is scored
    against ``df['rating']``.

    Parameters
    ----------
    ngram : tuple
        Forwarded to the vectorizer as ``ngram_range``.
    mindf : int or float
        Forwarded to the vectorizer as ``min_df``.
    df : pandas.DataFrame
        Frame with 'text' and 'rating' columns.
    tfidf : bool, optional
        ``False`` (default) uses CountVectorizer, anything else uses
        TfidfVectorizer.

    Returns
    -------
    list of (str, float)
        ``(feature, score)`` pairs sorted by descending score.
    """
    # `== False` is kept so non-bool arguments select the same branch as before.
    if tfidf == False:
        vectorizer = CountVectorizer(ngram_range=ngram,min_df=mindf)
    else:
        # NOTE(review): mutual_info_score is documented as a metric over
        # discrete labelings; confirm that feeding TF-IDF weights is intended.
        vectorizer = TfidfVectorizer(ngram_range=ngram,min_df=mindf)
    X = vectorizer.fit_transform(df['text']).toarray()
    y = df['rating']
    # get_feature_names() was removed in scikit-learn 1.2; use its replacement
    # when present and fall back to the old name on older releases.
    if hasattr(vectorizer, 'get_feature_names_out'):
        features = list(vectorizer.get_feature_names_out())
    else:
        features = vectorizer.get_feature_names()

    mi_dict = dict()
    for i, feature in enumerate(features):
        mi_dict[feature] = metrics.mutual_info_score(X[:,i],y)

    sorted_dict = sorted(mi_dict.items(), key=lambda kv: kv[1], reverse=True)
    return sorted_dict

### Get and save results

In [67]:
def save_mi(filename,mi,n):
    """Write the first ``n`` entries of a ranking to ``Data/<filename>``.

    The file is a ';'-delimited CSV with the header ``feature;mi_score``
    followed by one row per entry.

    Parameters
    ----------
    filename : str
        File name, appended to the 'Data/' prefix.
    mi : list
        Sequence of entries whose items 0 and 1 are the feature and its score.
    n : int
        Maximum number of entries to write. If ``mi`` holds fewer than ``n``
        entries, all of them are written.
    """
    header = ["feature","mi_score"]
    # Slicing rather than indexing over range(n) tolerates rankings shorter
    # than n; max() keeps a negative n meaning "no rows", as range() did.
    top_entries = mi[:max(n, 0)]
    rows = [header] + [[entry[0], entry[1]] for entry in top_entries]

    # The with-block closes the file; no explicit close() is needed.
    with open('Data/'+filename,'w',newline='',encoding="utf-8") as writeFile:
        writer = csv.writer(writeFile,delimiter=';')
        writer.writerows(rows)

In [68]:
# Vectorizer settings shared by the ranking cells below:
# `ngram` is passed as ngram_range and `mindf` as min_df.
ngram = (1,3)
mindf = 2

In [69]:
# One ranking per class code, stored so that mi[c] is the ranking for class c
# (the save cell names each file after the list position).
# The loop bound comes from the largest integer code, not from the number of
# distinct ratings: counting distinct values skips a class when a lower code
# is absent from this cluster, and overcounts when a non-numeric rating is
# still present in the column.
class_codes = [r for r in df['rating'].unique() if isinstance(r, (int, np.integer))]
n_classes = max(class_codes) + 1 if class_codes else 0

mi = list()

for c in range(n_classes):
    mi.append(get_mi_dict(ngram,mindf,c,df))

In [70]:
# Save each per-class ranking (first 500 entries requested) as
# Data/<cluster_name>_mi_class_<i>.csv, where i is its position in `mi`.
for i, class_ranking in enumerate(mi):
    suffix = '_mi_class_%d.csv' % i
    filename = ''.join([cluster_name, suffix])
    save_mi(filename, class_ranking, 500)

In [71]:
# scikit-learn ranking on raw term counts (tfidf=False selects CountVectorizer).
mi = get_mi_dict_sklearn(ngram, mindf, df, tfidf=False)

suffix = '_mi_tf_sklearn.csv'
filename = ''.join([cluster_name, suffix])
save_mi(filename, mi, 500)

In [72]:
# scikit-learn ranking on TF-IDF weights (tfidf=True selects TfidfVectorizer).
mi = get_mi_dict_sklearn(ngram, mindf, df, tfidf=True)

suffix = '_mi_tfidf_sklearn.csv'
filename = ''.join([cluster_name, suffix])
save_mi(filename, mi, 500)

### Mutual Information: Author Types

In [99]:
# Read authors
# Builds authors_dict from Data/authors_classified.csv: the first column of
# each data row is the key and the second column is the value.
authors_dict = dict()

with open('Data/authors_classified.csv', 'r', newline='', encoding="utf-8") as file:
    reader = csv.reader(file)
    next(reader)  # Skip header row.
    authors_dict.update((row[0], row[1]) for row in reader)

In [100]:
# Delete unclassified rows
# Keep only the rows whose 'author' is a key of authors_dict, then renumber
# the index from 0.
ix = [label for label, author in df['author'].items() if author in authors_dict]
df = df.loc[ix].reset_index(drop=True)

In [101]:
def add_type_columns(df):
    """Add 0/1 author-type indicator columns to ``df`` in place.

    Six columns are created and initialised to 0: 'person', 'democrat',
    'republican', 'political', 'journalist' and 'organization'. For every row
    the author's type is looked up in the module-level ``authors_dict`` and
    the matching columns are set to 1. 'Democrat', 'Republican', 'Political'
    and 'Journalist' also set 'person'. An unrecognised type leaves the row
    at all zeros.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame with an 'author' column whose values are keys of
        ``authors_dict``; it is modified in place.

    Returns
    -------
    pandas.DataFrame
        The same frame object that was passed in.
    """
    indicator_columns = (
        'person', 'democrat', 'republican',
        'political', 'journalist', 'organization',
    )
    # Author type -> indicator columns switched on for that type.
    columns_for_type = {
        'Person': ('person',),
        'Democrat': ('person', 'democrat'),
        'Republican': ('person', 'republican'),
        'Political': ('person', 'political'),
        'Journalist': ('person', 'journalist'),
        'Organization': ('organization',),
    }

    for column in indicator_columns:
        df[column] = 0

    for row_label in df.index:
        author_type = authors_dict[df.at[row_label, 'author']]
        for column in columns_for_type.get(author_type, ()):
            df.at[row_label, column] = 1
    return df

In [102]:
# Attach the six author-type indicator columns (see add_type_columns).
df = add_type_columns(df)

In [103]:
# Get sklearn MI
# One mutual_info_score per author-type indicator column, each computed
# against the rating column.
mi = {
    column: metrics.mutual_info_score(df[column].to_numpy(), df['rating'].to_numpy())
    for column in (
        'person', 'democrat', 'republican',
        'political', 'journalist', 'organization',
    )
}

In [104]:
def save_author_mi(filename,mi):
    """Write a feature -> score mapping to ``Data/<filename>``.

    The output is a ';'-delimited CSV with the header ``feature;mi_score``
    and one row per key of ``mi``, in the mapping's iteration order.

    Parameters
    ----------
    filename : str
        File name, appended to the 'Data/' prefix.
    mi : dict
        Mapping from feature name to score.
    """
    table = [["feature", "mi_score"]]
    table.extend([feature, mi[feature]] for feature in mi)

    target = 'Data/' + filename
    with open(target, 'w', newline='', encoding="utf-8") as writeFile:
        csv.writer(writeFile, delimiter=';').writerows(table)

In [105]:
# Write the author-type scores held in `mi` to
# Data/<cluster_name>_mi_author_sklearn.csv.
suffix = '_mi_author_sklearn.csv'
filename = cluster_name + suffix
save_author_mi(filename,mi)

### Mutual Information HMIN232M Formula (Author Types)

In [106]:
def get_mi(c,x,y):
    """Mutual information between a 0/1 indicator and membership in class ``c``.

    With A = positions where ``x`` is 1 and ``y`` equals ``c``, B = positions
    where ``x`` is 1 and ``y`` differs from ``c``, C = positions where ``x``
    is not 1 and ``y`` equals ``c``, and m = ``len(x)``, the result is
    ``log10(A * m / ((A + C) * (A + B)))``, or 0 when A is 0.

    Parameters
    ----------
    c : object
        Class value compared against the items of ``y``.
    x : sequence
        Indicator values; an item counts as "present" when it equals 1.
    y : sequence
        Class values, indexed in parallel with ``x``.

    Returns
    -------
    float or int
        The score, or the integer 0 when no position has both the indicator
        and the class.
    """
    total = len(x)
    both = 0          # indicator set and class matches (A)
    flag_only = 0     # indicator set, other class (B)
    class_only = 0    # indicator not set, class matches (C)

    for pos in range(total):
        flagged = x[pos] == 1
        matches = y[pos] == c
        if flagged and matches:
            both += 1
        elif flagged:
            flag_only += 1
        elif matches:
            class_only += 1

    if not both > 0:
        return 0
    return math.log10((both * total) / ((both + class_only) * (both + flag_only)))

In [107]:
# One {author-type column: score} dict per class code, stored so that
# mi_list[c] belongs to class c (the save cell names each file after the list
# position).
# The loop bound comes from the largest integer code, not from the number of
# distinct ratings: counting distinct values skips a class when a lower code
# is absent from the data, and overcounts when a non-numeric rating is still
# present in the column.
rating_values = df['rating'].to_numpy()
class_codes = [r for r in df['rating'].unique() if isinstance(r, (int, np.integer))]
n_classes = max(class_codes) + 1 if class_codes else 0
author_columns = ['person', 'democrat', 'republican', 'political', 'journalist', 'organization']

mi_list = list()

for c in range(n_classes):
    mi = {
        column: get_mi(c, df[column].to_numpy(), rating_values)
        for column in author_columns
    }
    mi_list.append(mi)

In [108]:
# Save each per-class author-type score dict as
# Data/<cluster_name>_mi_author_class_<i>.csv, where i is its position in mi_list.
for i, class_scores in enumerate(mi_list):
    suffix = '_mi_author_class_%d.csv' % i
    filename = ''.join([cluster_name, suffix])
    save_author_mi(filename, class_scores)