In [1]:
# Build the contraction expander that the text preprocessing step relies on.
from pycontractions import Contractions

# Model identifier handed to Contractions through its `api_key` argument.
CONTRACTIONS_MODEL = "glove-twitter-100"

cont = Contractions(api_key=CONTRACTIONS_MODEL)
cont.load_models()  # load the model up front, before any text is expanded

  'See the migration notes for details: %s' % _MIGRATION_NOTES_URL


### Load Data

In [2]:
import pandas as pd
import numpy 

# CSV inputs: file1-file3 are read into the frames later used as topics,
# file4 into the frame named `dfall`.
file1, file2, file3, file4 = (
    'Data/c1.csv',
    'Data/c2.csv',
    'Data/c3.csv',
    'Data/c4.csv',
)

In [3]:
def _load_rated_claims(path):
    """Read one claims CSV and drop the rows rated 'OTHER'.

    Parameters
    ----------
    path : str
        Location of a comma-separated file that has a 'rating' column.

    Returns
    -------
    pandas.DataFrame
        An independent copy of the rows whose 'rating' is not 'OTHER'.
    """
    frame = pd.read_csv(path, sep=",")
    # .copy() detaches the filtered rows from the frame that was read, so the
    # in-place writes done during preprocessing act on a frame of their own.
    return frame.loc[frame['rating'] != 'OTHER'].copy()


df1 = _load_rated_claims(file1)
df2 = _load_rated_claims(file2)
df3 = _load_rated_claims(file3)
dfall = _load_rated_claims(file4)

### Preprocess

In [4]:
import re

def preprocess_claim(sentence):
    """Normalise one claim into lower-case, space-separated tokens.

    Steps, in order:

    1. lower-case and trim the text;
    2. turn the typographic apostrophe into an ASCII one;
    3. spell out "u.s." as "united states";
    4. expand contractions with the module-level ``cont`` object;
    5. replace by a space every character that is not a letter, digit,
       underscore, dot, comma or typographic apostrophe, as well as dots and
       commas that have no digit on either side;
    6. delete all remaining commas and dots (so "1,000" becomes "1000" and
       "3.5" becomes "35");
    7. drop stand-alone " a " tokens, collapse whitespace, drop " s " tokens;
    8. strip a leading "says ";
    9. discard every one-character token.

    Parameters
    ----------
    sentence : str
        Raw claim text.

    Returns
    -------
    str
        The cleaned claim; an empty string when nothing survives.
    """
    sentence = sentence.lower().strip()
    sentence = sentence.translate(str.maketrans('’', "'", ''))
    # Raw strings keep the regex escapes intact without relying on Python
    # passing unknown backslash escapes through (which triggers a warning).
    sentence = re.sub(r"u\.s\.", "united states", sentence)
    sentence = list(cont.expand_texts([sentence], precise=True))[0]
    sentence = re.sub(
        r"[^a-zA-Z0-9_.’,]|(?<!\d)\.(?!\d)|(?<!\w)-(?!\w)|(?<!\d)\,(?!\d)",
        ' ',
        sentence,
    )
    # Whatever commas and dots are left sit next to a digit; remove them too.
    sentence = sentence.replace(",", "")
    sentence = sentence.replace(".", "")
    sentence = sentence.replace(" a ", " ")
    sentence = re.sub(r'\s+', ' ', sentence).strip()
    sentence = sentence.replace(" s ", " ")
    if sentence.startswith('says '):
        sentence = sentence[5:]
    return ' '.join(word for word in sentence.split() if len(word) > 1)

def preprocess_df(df):
    """Clean the claims of ``df`` and encode its ratings, modifying it in place.

    Every row's 'text' is passed through ``preprocess_claim``. A 'rating' equal
    to 'FALSE', 'TRUE' or 'MIXTURE' is replaced by 0, 1 or 2 respectively; any
    other value is left untouched. The same ``df`` object is returned.
    """
    rating_codes = (('FALSE', 0), ('TRUE', 1), ('MIXTURE', 2))
    for row_label in df.index:
        raw_text = df.at[row_label, 'text']
        df.at[row_label, 'text'] = preprocess_claim(raw_text)
        # Compare against the value read before any write to this row.
        label = df.at[row_label, 'rating']
        for name, code in rating_codes:
            if label == name:
                df.at[row_label, 'rating'] = code
    return df

In [5]:
# Preprocess every frame; each name is rebound to what preprocess_df returns.
df1, df2, df3, dfall = [
    preprocess_df(frame) for frame in (df1, df2, df3, dfall)
]

### ITF

In [17]:
from sklearn.feature_extraction.text import CountVectorizer

def get_itf_dict(ngram, mindf, dfall, topics):
    """Compute an inverse topic frequency (ITF) for the n-grams of a vocabulary.

    The vocabulary is learned by a ``CountVectorizer`` fitted on
    ``dfall['text']``. For each n-gram the ITF is
    ``len(topics) / (number of topics with at least one claim containing it)``,
    where "containing" means the n-gram occurs as a run of whole,
    space-separated words.

    Parameters
    ----------
    ngram : tuple
        Passed to ``CountVectorizer`` as ``ngram_range``.
    mindf : int or float
        Passed to ``CountVectorizer`` as ``min_df``.
    dfall : mapping with a 'text' entry
        Texts the vocabulary is learned from.
    topics : sequence
        One item per topic; each item's 'text' entry yields claim strings.

    Returns
    -------
    dict
        n-gram -> ITF. N-grams that occur in none of the topics are left out,
        because their ITF would require a division by zero.
    """
    # Forward the caller's threshold instead of a fixed value.
    vectorizer = CountVectorizer(ngram_range=ngram, min_df=mindf)
    vectorizer.fit(dfall['text'])
    # Newer scikit-learn only offers get_feature_names_out(); older releases
    # only offer get_feature_names(). Use whichever this installation has.
    if hasattr(vectorizer, 'get_feature_names_out'):
        vocabulary = vectorizer.get_feature_names_out()
    else:
        vocabulary = vectorizer.get_feature_names()

    # Pad every claim once so that ' ngram ' matches whole words only,
    # including at the very start and end of a claim.
    padded_topics = [
        [' ' + claim + ' ' for claim in topic['text']] for topic in topics
    ]

    itfdict = {}
    for key in map(str, vocabulary):
        needle = ' ' + key + ' '
        topic_hits = sum(
            1
            for claims in padded_topics
            if any(needle in claim for claim in claims)
        )
        if topic_hits:
            itfdict[key] = len(topics) / topic_hits
    return itfdict

In [18]:
# N-gram sizes for the vocabulary and the frames treated as separate topics.
ngram = (1, 3)
topics = [df1, df2, df3]

itfdict = get_itf_dict(ngram=ngram, mindf=2, dfall=dfall, topics=topics)
# vectorizer = CountVectorizer(ngram_range=ngram,min_df=2)
# vectorizer.fit(dfall['text'])

# itfdict = {key:0 for key in vectorizer.get_feature_names()}

In [7]:
# Recompute the ITF table from scratch rather than adding counts onto the
# values already stored in `itfdict`. The loop that used to live here
# incremented and then re-divided whatever was stored, so running it once
# `itfdict` already held ITF values (or running it twice) gave wrong numbers.
topics = [df1, df2, df3]

itfdict = get_itf_dict(ngram, 2, dfall, topics)

In [8]:
# Show the n-gram -> ITF mapping as this cell's output.
itfdict

{'01': 3.0,
 '01 percent': 3.0,
 '02': 1.5,
 '09': 1.5,
 '10': 1.0,
 '10 billion': 1.0,
 '10 billion in': 3.0,
 '10 collective': 3.0,
 '10 collective bargaining': 3.0,
 '10 days': 3.0,
 '10 highest': 3.0,
 '10 in': 3.0,
 '10 in the': 3.0,
 '10 million': 3.0,
 '10 minimum': 3.0,
 '10 minimum wage': 3.0,
 '10 most': 1.5,
 '10 of': 3.0,
 '10 percent': 1.0,
 '10 percent for': 3.0,
 '10 percent in': 3.0,
 '10 percent of': 1.0,
 '10 public': 1.5,
 '10 school': 3.0,
 '10 states': 3.0,
 '10 the': 3.0,
 '10 times': 1.5,
 '10 years': 1.0,
 '10 years ago': 3.0,
 '10 years only': 3.0,
 '10 years our': 3.0,
 '10 years than': 1.5,
 '10 years we': 1.5,
 '100': 1.0,
 '100 and': 1.5,
 '100 billion': 1.0,
 '100 days': 3.0,
 '100 million': 1.5,
 '100 million in': 3.0,
 '100 million people': 1.5,
 '100 of': 1.5,
 '100 percent': 1.0,
 '100 percent of': 1.0,
 '100 to': 1.5,
 '100 years': 1.5,
 '1000': 1.0,
 '1000 american': 3.0,
 '1000 each': 1.5,
 '1000 people': 1.5,
 '1000 people day': 1.5,
 '1000 per': 3

### ICF

In [9]:
ngram = (1,3)
vectorizer = CountVectorizer(ngram_range=ngram,min_df=2)
vectorizer.fit(df1['text'])

# Newer scikit-learn only offers get_feature_names_out(); older releases only
# offer get_feature_names(). Use whichever this installation has.
if hasattr(vectorizer, 'get_feature_names_out'):
    feature_names = vectorizer.get_feature_names_out()
else:
    feature_names = vectorizer.get_feature_names()

# Start every n-gram of the df1 vocabulary with a count of zero.
icfdict = {str(key): 0 for key in feature_names}

In [10]:
classes = df1["rating"].unique()

dfcldict = {key:df1.loc[df1["rating"] == key] for key in classes}

# Claims of each class, padded with spaces once, so that ' ngram ' matches
# whole words only (also at the start and end of a claim).
padded_claims_by_class = [
    [' ' + claim + ' ' for claim in dfcldict[cl]["text"]] for cl in classes
]

# For every n-gram, store the number of classes that have at least one claim
# containing it. The count is assigned rather than added to the stored value,
# so running this cell again yields the same result.
for key in icfdict:
    needle = ' ' + key + ' '
    icfdict[key] = sum(
        1
        for claims in padded_claims_by_class
        if any(needle in text for text in claims)
    )

In [11]:
# Show the n-gram -> class-count mapping as this cell's output.
icfdict

{'10': 3,
 '10 billion': 1,
 '10 days': 2,
 '10 percent': 1,
 '10 percent of': 1,
 '10 years': 2,
 '100': 3,
 '100 percent': 2,
 '100 to': 2,
 '100 years': 2,
 '1000': 2,
 '10000': 2,
 '100000': 3,
 '100000 jobs': 2,
 '11': 3,
 '12': 3,
 '120000': 1,
 '13': 1,
 '14': 2,
 '14 percent': 1,
 '15': 3,
 '15 percent': 2,
 '15 years': 1,
 '150': 2,
 '150000': 2,
 '16': 2,
 '16 percent': 2,
 '16 years': 1,
 '17': 2,
 '17 years': 1,
 '170': 2,
 '1761': 1,
 '18': 2,
 '1970s': 2,
 '1977': 1,
 '1980': 2,
 '1980 and': 2,
 '1994': 2,
 '20': 3,
 '20 percent': 2,
 '20 years': 2,
 '200': 2,
 '2000': 3,
 '20000': 2,
 '200000': 2,
 '200000 from': 2,
 '2003': 2,
 '2005': 2,
 '2008': 3,
 '2009': 2,
 '2009 to': 2,
 '2010': 2,
 '2011': 2,
 '2012': 2,
 '2013': 1,
 '2016': 2,
 '2017': 1,
 '2025': 2,
 '2030': 1,
 '21': 2,
 '24': 2,
 '25': 3,
 '25 percent': 1,
 '25 percent of': 1,
 '250': 1,
 '27': 2,
 '27 million': 2,
 '30': 2,
 '30 cents': 2,
 '30 percent': 2,
 '30 years': 2,
 '30 years of': 1,
 '300': 2,
 '30

### ITF * ICF

In [12]:
# Weight of an n-gram = its ITF value times its ICF value; only the n-grams
# present in `icfdict` are kept.
itficfdict = {
    key: itfdict[key] * class_count for key, class_count in icfdict.items()
}

### ITF * ICF Vectors

In [13]:
# Keys view of `itficfdict`; its iteration order fixes the position of each
# n-gram inside the vectors produced by computeITFICFVector.
worddict = itficfdict.keys()

def computeITFICFVector(claim):
    """Return the ITF*ICF weights of ``claim`` as a list aligned with ``worddict``.

    Position i holds ``itficfdict[word]`` for the i-th word of ``worddict`` when
    that word occurs in ``claim`` as a run of whole, space-separated tokens,
    and 0.0 otherwise.
    """
    padded_claim = ' ' + claim + ' '
    return [
        itficfdict[word] if (' ' + word + ' ') in padded_claim else 0.0
        for word in worddict
    ]

# The notebook's plain `import numpy` does not bind the `np` alias used here,
# so import it in this cell.
import numpy as np

# One row per claim of df1, one column per n-gram of `worddict`.
vectors = np.array([computeITFICFVector(claim) for claim in df1['text']])
#Vectors to DF
#pd.DataFrame(vectors, columns=itficfdict.keys())

### Classification

In [14]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

In [16]:
# Feature matrix and integer class labels for the claims of df1.
X = vectors
y = df1["rating"].astype('int')

clf = LinearSVC(random_state=0, tol=1e-5)
# Seed the shuffled folds as well, so that the reported score is the same on
# every run instead of depending on a fresh random split.
k_fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=0)
score = cross_val_score(clf, X, y, cv=k_fold, scoring='accuracy')
print('Mean accuracy: %f, Deviation: %f' % (score.mean(),score.std()))

Mean accuracy: 0.410909, Deviation: 0.053600
