In [1]:
# Load contractions model
from pycontractions import Contractions

cont = Contractions(api_key="glove-twitter-100")
cont.load_models()

### Load Data

In [2]:
import pandas as pd
import numpy as np
import math

file1 = 'Data/c1.csv'
file2 = 'Data/c2.csv'
file3 = 'Data/c3.csv'
file4 = 'Data/c4.csv'

In [3]:
df1 = pd.read_csv(file1,sep=",")
df2 = pd.read_csv(file2,sep=",")
df3 = pd.read_csv(file3,sep=",")
dfall = pd.read_csv(file4,sep=",")

df1 = df1.loc[df1['rating'] != 'OTHER']
df2 = df2.loc[df2['rating'] != 'OTHER']
df3 = df3.loc[df3['rating'] != 'OTHER']
dfall = dfall.loc[dfall['rating'] != 'OTHER']

### Preprocess

In [4]:
import re

def preprocess_claim(sentence):
    sentence = sentence.lower().strip()
    translator = str.maketrans('’', "'", '')
    sentence = sentence.translate(translator)
    sentence = re.sub("u\.s\.","united states",sentence)
    sentence = list(cont.expand_texts([sentence],precise=True))[0]
    sentence = re.sub("[^a-zA-Z0-9_.’,]|(?<!\d)\.(?!\d)|(?<!\w)-(?!\w)|(?<!\d)\,(?!\d)",' ',sentence)
    sentence = re.sub(",",'',sentence)
    sentence = re.sub("\.",'',sentence)
    sentence = re.sub(" a ",' ',sentence)
    sentence = re.sub('\s+', ' ', sentence).strip()
    sentence = re.sub(" s ",' ',sentence)
    if sentence[0:5] == 'says ':
        sentence = sentence[5:]
    sentence = ' '.join([w for w in sentence.split() if len(w)>1])
    return sentence

def preprocess_df(df):
    for index in df.index:
        df.at[index,'text'] = preprocess_claim(df.at[index,'text'])
        rating = df.at[index,'rating']
        if rating == 'FALSE':
            df.at[index,'rating'] = 0
        if rating == 'TRUE':
            df.at[index,'rating'] = 1
        if rating == 'MIXTURE':
            df.at[index,'rating'] = 2
    return df

In [5]:
df1 = preprocess_df(df1)
df2 = preprocess_df(df2)
df3 = preprocess_df(df3)
dfall = preprocess_df(dfall)

### ITF

In [6]:
from sklearn.feature_extraction.text import CountVectorizer

def get_itf_dict(ngram,mindf,dfall,topics):
    vectorizer = CountVectorizer(ngram_range=ngram,min_df=mindf)
    vectorizer.fit(dfall['text'])
    itfdict = {key:0 for key in vectorizer.get_feature_names()}
    for key in itfdict.keys():
        for topic in topics:
            for claim in topic['text']:
                cl = ' ' + claim + ' '
                if cl.find(' ' + key + ' ') >= 0:
                    itfdict[key] = itfdict[key]+1
                    break                
    for key in itfdict.keys():
        itfdict[key] = len(topics) / itfdict[key]
    return itfdict

### ICF

In [7]:
def get_icf_dict(ngram,mindf,df):
    vectorizer = CountVectorizer(ngram_range=ngram,min_df=mindf)
    vectorizer.fit(df['text'])
    icfdict = {key:0 for key in vectorizer.get_feature_names()}
    classes = df["rating"].unique()

    dfcldict = {key:df.loc[df["rating"] == key] for key in classes}

    for key in icfdict.keys():
        for cl in classes:
            dfcl = dfcldict[cl]
            for claim in dfcl["text"]:
                text = ' ' + claim + ' '
                if text.find(' ' + key + ' ') >= 0:
                    icfdict[key] = icfdict[key] + 1
                    break
    return icfdict 

### ITF * ICF

In [8]:
def get_itf_icf_dict(itfdict,icfdict):
    itficfdict = { key:itfdict[key] * icfdict[key] for key in icfdict.keys() }
    return itficfdict

### ITF * ICF Vectors

In [9]:
def computeITFICFVector(claim,itficfdict):
    worddict = itficfdict.keys()
    itficfVector = [0.0] * len(worddict)
    for i, word in enumerate(worddict):
        w = ' ' + word + ' '
        if w in ' ' + claim + ' ':
            itficfVector[i] = itficfdict[word]
    return itficfVector

def get_itf_icf_matrix(df,itficfdict):
    vectors = [computeITFICFVector(claim,itficfdict) for claim in df['text']]
    vectors = np.array(vectors)
    return vectors

### Classification ITF*ICF

In [10]:
def get_classes(df):
    df_top = df

    class1 = df_top.copy()
    class2 = df_top.copy()
    class2 = df_top.loc[(df_top['rating'] == 0) | (df_top['rating'] == 1)]
    class3 = df_top.copy()

    for index in class3.index:
        rating = class3.at[index,'rating']
        if rating == 1:
            class3.at[index,'rating'] = 0
        if rating == 2:
            class3.at[index,'rating'] = 1

    dfs = dict()
    dfs['3 Classes'] = class1
    dfs['2 Classes'] = class2
    dfs['2 Mixed Classes'] = class3
    return dfs

In [11]:
topics = dict()
topics['Topic 1'] = get_classes(df1)
topics['Topic 2'] = get_classes(df2)
topics['Topic 3'] = get_classes(df3)
topicall = get_classes(dfall)

In [12]:
from sklearn.svm import LinearSVC
from sklearn.model_selection import KFold
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_score

In [13]:
def classify(topics,topicall,topic,key):
    ngram = (1,3)
    mindf = 2
    dfs = [topics[t][key] for t in topics.keys()]
    dfall = topicall[key]
    
    itfdict = get_itf_dict(ngram,mindf,dfall,dfs)
    icfdict = get_icf_dict(ngram,mindf,topics[topic][key])
    itficfdict = get_itf_icf_dict(itfdict,icfdict)
    
    X = get_itf_icf_matrix(topics[topic][key], itficfdict)
    y = topics[topic][key]["rating"]
    y = y.astype('int')
    clf = LinearSVC(random_state=0, tol=1e-5)
    k_fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=None)
    score = cross_val_score(clf, X, y, cv=k_fold, scoring='accuracy')
    print('%s_%s - Mean accuracy: %f, Deviation: %f' % (topic,key,score.mean(),score.std()))

In [14]:
# for topickey in topics.keys():
#     for classkey in dfs.keys():
#         classify(topics, topicall, topickey, classkey)

### Classification ITF tout le corpus ICF 3 Classes

In [15]:
# Get ITF
ngram = (1,3)
mindf = 2

dfs = [df1, df2, df3]
itfdict = get_itf_dict(ngram,mindf,dfall,dfs)

In [16]:
t = 0
for df in dfs:
    t += 1
    icfdict = get_icf_dict(ngram,mindf,df)
    itficfdict = get_itf_icf_dict(itfdict,icfdict)
    X = get_itf_icf_matrix(df, itficfdict)
    y = df["rating"]
    y = y.astype('int')
    clf = LinearSVC(random_state=0, tol=1e-5)
    k_fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=None)
    score = cross_val_score(clf, X, y, cv=k_fold, scoring='accuracy')
    print('Topic %d - Mean accuracy: %f, Deviation: %f' % (t,score.mean(),score.std()))

Topic 1 - Mean accuracy: 0.411257, Deviation: 0.049673




Topic 2 - Mean accuracy: 0.448178, Deviation: 0.012026




Topic 3 - Mean accuracy: 0.432632, Deviation: 0.023061




### IDF * ICF * ITF

In [17]:
from sklearn.feature_extraction.text import TfidfVectorizer

def get_idf_dict(ngram,mindf,df):
    vectorizer = TfidfVectorizer(ngram_range=ngram,min_df=mindf)
    vectorizer.fit(df['text'])
    return dict(zip(vectorizer.get_feature_names(), vectorizer.idf_))

def get_idf_itf_icf_dict(idfdict,itficfdict):
    itficfdict = { key:itficfdict[key] * idfdict[key] for key in itficfdict.keys() }
    return itficfdict

In [18]:
t = 0
for df in dfs:
    t += 1
    icf_dict = get_icf_dict(ngram,mindf,df)
    itf_icf_dict = get_itf_icf_dict(itfdict,icf_dict)
    idf_dict = get_idf_dict(ngram,mindf,df)
    idf_itf_icf_dict = get_idf_itf_icf_dict(idf_dict,itf_icf_dict)
    
    X = get_itf_icf_matrix(df, idf_itf_icf_dict)
    y = df["rating"]
    y = y.astype('int')
    clf = LinearSVC(random_state=0, tol=1e-5)
    k_fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=None)
    score = cross_val_score(clf, X, y, cv=k_fold, scoring='accuracy')
    print('Topic %d - Mean accuracy: %f, Deviation: %f' % (t,score.mean(),score.std()))

Topic 1 - Mean accuracy: 0.399236, Deviation: 0.052109




Topic 2 - Mean accuracy: 0.447187, Deviation: 0.017315




Topic 3 - Mean accuracy: 0.414222, Deviation: 0.029117




### TF IDF ICF ITF

In [19]:
A = np.random.rand(8, 5)
A

array([[0.46008491, 0.3363062 , 0.29056075, 0.21303715, 0.04661846],
       [0.41235523, 0.50155493, 0.75452317, 0.69850961, 0.89951913],
       [0.55417753, 0.64354765, 0.46978286, 0.61730783, 0.83814455],
       [0.55374268, 0.76548706, 0.21251609, 0.96989405, 0.02203122],
       [0.55087558, 0.80315892, 0.71977917, 0.86358738, 0.72687355],
       [0.54349763, 0.08499166, 0.36700619, 0.62839792, 0.14014116],
       [0.89664673, 0.53806868, 0.79791814, 0.79009112, 0.80068308],
       [0.16011373, 0.41907947, 0.52281495, 0.84998568, 0.32775986]])

In [20]:
A[:,1] = A[:,1] * 2
A

array([[0.46008491, 0.6726124 , 0.29056075, 0.21303715, 0.04661846],
       [0.41235523, 1.00310986, 0.75452317, 0.69850961, 0.89951913],
       [0.55417753, 1.28709531, 0.46978286, 0.61730783, 0.83814455],
       [0.55374268, 1.53097411, 0.21251609, 0.96989405, 0.02203122],
       [0.55087558, 1.60631785, 0.71977917, 0.86358738, 0.72687355],
       [0.54349763, 0.16998332, 0.36700619, 0.62839792, 0.14014116],
       [0.89664673, 1.07613737, 0.79791814, 0.79009112, 0.80068308],
       [0.16011373, 0.83815894, 0.52281495, 0.84998568, 0.32775986]])

In [21]:
icfdict = get_icf_dict(ngram,mindf,df1)
itficfdict = get_itf_icf_dict(itfdict,icfdict)

vectorizer = TfidfVectorizer(ngram_range=ngram,min_df=mindf)
vec = vectorizer.fit_transform(df1['text']).toarray()

In [22]:
for key,value in vectorizer.vocabulary_.items():
    vec[:,value] *= itficfdict[key]

In [23]:
X = vec
y = df1["rating"]
y = y.astype('int')
clf = LinearSVC(random_state=0, tol=1e-5)
k_fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=None)
score = cross_val_score(clf, X, y, cv=k_fold, scoring='accuracy')
print('Mean accuracy: %f, Deviation: %f' % (score.mean(),score.std()))

Mean accuracy: 0.407142, Deviation: 0.035457


### PROPRE ?

### TF ICF

In [24]:
ngram = (1,3)
mindf = 2

dfs = [df1, df2, df3]
itfdict = get_itf_dict(ngram,mindf,dfall,dfs)

In [25]:
t = 0
for df in dfs:
    t += 1
    vectorizer = CountVectorizer(ngram_range=ngram,min_df=mindf)
    icfdict = get_icf_dict(ngram,mindf,df)
    vec = vectorizer.fit_transform(df['text']).toarray()
    for key,value in vectorizer.vocabulary_.items():
        vec[:,value] = vec[:,value] * math.log10(1 + icfdict[key])
    
    X = vec
    y = df["rating"]
    y = y.astype('int')
    clf = LinearSVC(random_state=0, tol=1e-5)
    k_fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=None)
    score = cross_val_score(clf, X, y, cv=k_fold, scoring='accuracy')
    print('Topic %d - Mean accuracy: %f, Deviation: %f' % (t,score.mean(),score.std()))

Topic 1 - Mean accuracy: 0.539307, Deviation: 0.023902
Topic 2 - Mean accuracy: 0.581900, Deviation: 0.009128
Topic 3 - Mean accuracy: 0.526916, Deviation: 0.010182


### TF ITF

In [26]:
t = 0
for df in dfs:
    t += 1
    vectorizer = CountVectorizer(ngram_range=ngram,min_df=mindf)
    vec = vectorizer.fit_transform(df['text']).toarray()
    for key,value in vectorizer.vocabulary_.items():
        vec[:,value] = vec[:,value] * math.log10(1 + itfdict[key])
    
    X = vec
    y = df["rating"]
    y = y.astype('int')
    clf = LinearSVC(random_state=0, tol=1e-5)
    k_fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=None)
    score = cross_val_score(clf, X, y, cv=k_fold, scoring='accuracy')
    print('Topic %d - Mean accuracy: %f, Deviation: %f' % (t,score.mean(),score.std()))

Topic 1 - Mean accuracy: 0.542388, Deviation: 0.011594
Topic 2 - Mean accuracy: 0.602291, Deviation: 0.002767
Topic 3 - Mean accuracy: 0.541851, Deviation: 0.006148


### TF ICF ITF

In [27]:
t = 0
for df in dfs:
    t += 1
    vectorizer = CountVectorizer(ngram_range=ngram,min_df=mindf)
    icfdict = get_icf_dict(ngram,mindf,df)
    vec = vectorizer.fit_transform(df['text']).toarray()
    for key,value in vectorizer.vocabulary_.items():
        vec[:,value] = vec[:,value] * math.log10(1 + icfdict[key]) * math.log10(1 + itfdict[key])
    
    X = vec
    y = df["rating"]
    y = y.astype('int')
    clf = LinearSVC(random_state=0, tol=1e-5)
    k_fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=None)
    score = cross_val_score(clf, X, y, cv=k_fold, scoring='accuracy')
    print('Topic %d - Mean accuracy: %f, Deviation: %f' % (t,score.mean(),score.std()))

Topic 1 - Mean accuracy: 0.546500, Deviation: 0.004163
Topic 2 - Mean accuracy: 0.604526, Deviation: 0.001088
Topic 3 - Mean accuracy: 0.541142, Deviation: 0.002470


### TF IDF ICF ITF

In [28]:
t = 0
for df in dfs:
    t += 1
    vectorizer = TfidfVectorizer(ngram_range=ngram,min_df=mindf)
    icfdict = get_icf_dict(ngram,mindf,df)
    vec = vectorizer.fit_transform(df['text']).toarray()
    for key,value in vectorizer.vocabulary_.items():
        vec[:,value] = vec[:,value] * math.log10(1 + icfdict[key]) * math.log10(1 + itfdict[key])
    
    X = vec
    y = df["rating"]
    y = y.astype('int')
    clf = LinearSVC(random_state=0, tol=1e-5)
    k_fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=None)
    score = cross_val_score(clf, X, y, cv=k_fold, scoring='accuracy')
    print('Topic %d - Mean accuracy: %f, Deviation: %f' % (t,score.mean(),score.std()))

Topic 1 - Mean accuracy: 0.546500, Deviation: 0.004163
Topic 2 - Mean accuracy: 0.608007, Deviation: 0.002604
Topic 3 - Mean accuracy: 0.541846, Deviation: 0.003483


### All data classification

In [33]:
df = dfall

In [34]:
#TF ITF
vectorizer = CountVectorizer(ngram_range=ngram,min_df=mindf)
vec = vectorizer.fit_transform(df['text']).toarray()
for key,value in vectorizer.vocabulary_.items():
    vec[:,value] = vec[:,value] * math.log10(1 + itfdict[key])

X = vec
y = df["rating"]
y = y.astype('int')
clf = LinearSVC(random_state=0, tol=1e-5)
k_fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=None)
score = cross_val_score(clf, X, y, cv=k_fold, scoring='accuracy')
print('All - Mean accuracy: %f, Deviation: %f' % (score.mean(),score.std()))

All - Mean accuracy: 0.580516, Deviation: 0.002139


In [35]:
#TF ICF
vectorizer = CountVectorizer(ngram_range=ngram,min_df=mindf)
icfdict = get_icf_dict(ngram,mindf,df)
vec = vectorizer.fit_transform(df['text']).toarray()
for key,value in vectorizer.vocabulary_.items():
    vec[:,value] = vec[:,value] * math.log10(1 + icfdict[key])

X = vec
y = df["rating"]
y = y.astype('int')
clf = LinearSVC(random_state=0, tol=1e-5)
k_fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=None)
score = cross_val_score(clf, X, y, cv=k_fold, scoring='accuracy')
print('All - Mean accuracy: %f, Deviation: %f' % (score.mean(),score.std()))

All - Mean accuracy: 0.558174, Deviation: 0.011616


In [36]:
#TF ICF ITF
vectorizer = CountVectorizer(ngram_range=ngram,min_df=mindf)
icfdict = get_icf_dict(ngram,mindf,df)
vec = vectorizer.fit_transform(df['text']).toarray()
for key,value in vectorizer.vocabulary_.items():
    vec[:,value] = vec[:,value] * math.log10(1 + icfdict[key]) * math.log10(1 + itfdict[key])

X = vec
y = df["rating"]
y = y.astype('int')
clf = LinearSVC(random_state=0, tol=1e-5)
k_fold = StratifiedKFold(n_splits=10, shuffle=True, random_state=None)
score = cross_val_score(clf, X, y, cv=k_fold, scoring='accuracy')
print('All - Mean accuracy: %f, Deviation: %f' % (score.mean(),score.std()))

All - Mean accuracy: 0.580515, Deviation: 0.000459
