In [6]:
import pandas as pd
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline

In [33]:
HEADER = ["ID", "label", "statement", "subject", "speaker", "speaker's job", "state info", "party", "barely true counts", "false counts", "half true counts", "mostly true counts", "pants on fire counts", "venue"]
train = pd.read_csv('data/train.csv', header=None)
train.columns = HEADER

In [34]:
train.tail(2)

Unnamed: 0,ID,label,statement,subject,speaker,speaker's job,state info,party,barely true counts,false counts,half true counts,mostly true counts,pants on fire counts,venue
10238,2253.json,FALSE,On lifting the U.S. Cuban embargo and allowing...,"florida,foreign-policy",jeff-greene,,Florida,democrat,3.0,1.0,3.0,0.0,0.0,a televised debate on Miami's WPLG-10 against ...
10239,1155.json,pants-fire,The Department of Veterans Affairs has a manua...,"health-care,veterans",michael-steele,chairman of the Republican National Committee,Maryland,republican,0.0,1.0,1.0,0.0,2.0,a Fox News interview


In [13]:
test = pd.read_csv('data/test.csv', header=None)
test.columns = HEADER

In [14]:
both = [train, test]
combined = pd.concat(both)
combined.shape

(11507, 14)

In [36]:
simple_train = combined.copy()

def set_simple_labels(df):
    """True-ish is 2, kinda true is 1, false-ish is 0"""
    new_labels = []
    for l in df.label:
        if l == 'pants-fire' or l == 'FALSE':
            new_labels.append(0)
        elif l == "barely-true" or l == "half-true":
            new_labels.append(1)
        elif l == "mostly-true" or l == "TRUE":
            new_labels.append(2)
        else:
            assert False, "{} is not a normal label".format(l)
    df.label = new_labels

    
set_simple_labels(train)
# simple_label = simplify_labels(combined.label)
# df2 = pd.DataFrame(simple_label, columns=list('A'))
# print(df2)
# simple_train.label = simple_label
# simple_train.append(df2)
# simple_train.head(5)

AssertionError: 0 is not a normal label

In [37]:
train.head(5)

Unnamed: 0,ID,label,statement,subject,speaker,speaker's job,state info,party,barely true counts,false counts,half true counts,mostly true counts,pants on fire counts,venue
0,2635.json,0,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer
1,10540.json,1,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.
2,324.json,2,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver
3,1123.json,0,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release
4,9028.json,1,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on CNN


In [38]:
simple_train_b = combined.copy()

def set_party(df):
    """Conservative Spectrum"""
    party = []
    for p in df.party:
        if p == 'republican':
            party.append(1)
        elif p == "democrat" :
            party.append(0)
        else:
            party.append(.5)
    df.party = party
    
set_party(train)

In [39]:
train.head(5)

Unnamed: 0,ID,label,statement,subject,speaker,speaker's job,state info,party,barely true counts,false counts,half true counts,mostly true counts,pants on fire counts,venue
0,2635.json,0,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,1.0,0.0,1.0,0.0,0.0,0.0,a mailer
1,10540.json,1,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,0.0,0.0,0.0,1.0,1.0,0.0,a floor speech.
2,324.json,2,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,0.0,70.0,71.0,160.0,163.0,9.0,Denver
3,1123.json,0,Health care reform legislation is likely to ma...,health-care,blog-posting,,,0.5,7.0,19.0,3.0,5.0,44.0,a news release
4,9028.json,1,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,0.0,15.0,9.0,20.0,19.0,2.0,an interview on CNN


In [17]:
simple_train_c = combined.copy()
#for index, row in combined.iterrows():
    #print(index)
    #print(row['mostly true counts'])
def simplify_labels3(df):
    """True-ish is 2, kinda true is 1, false-ish is 0"""
    new_labels = []
    for index, row in df.iterrows():
        ratio = (row['mostly true counts']+1) / ((row['barely true counts'] + row['false counts'] + row['pants on fire counts'])+1)
        new_labels.append(ratio)
    return new_labels




simple_label_c = simplify_labels3(combined)
#simple_train_c.subject = simple_label_b



In [18]:
y_onehot = pd.get_dummies(train.label)
raw_X = train.statement
raw_X_test = test.statement
raw_combined = combined.statement
simple_combined = simple_train.statement

In [19]:
vectorizer = CountVectorizer()
#X = vectorizer.fit_transform(raw_X)
#X_test = vectorizer.fit_transform(raw_X_test)
combined_data = vectorizer.fit_transform(raw_combined)
simple_data = vectorizer.fit_transform(simple_combined)
print(combined_data.shape)
print(raw_X.shape)
print(y_onehot.shape)
print(train.label.shape)
print(raw_X_test.shape)
print(simple_data.shape)

(11507, 12873)
(10240,)
(10240, 6)
(10240,)
(1267,)
(11507, 12873)


In [20]:
#Train Test Split
#Could use same X w/ different labels
import numpy as np
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
     combined_data, combined.label, test_size=0.25, random_state=42)
X_train_simple, X_test_simple, y_train_simple, y_test_simple = train_test_split(
     simple_data, simple_train.label, test_size=0.25, random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(8630, 12873) (2877, 12873) (8630,) (2877,)


In [21]:
#With 6 Labels
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0, solver='lbfgs',
                         multi_class='multinomial').fit(X_train, y_train)



In [22]:
print(clf.score(X_train, y_train))
print(clf.score(X_test, y_test))
results = clf.predict(X_train)
print(results)

0.8909617612977984
0.22384428223844283
['half-true' 'FALSE' 'FALSE' ... 'barely-true' 'FALSE' 'FALSE']


In [23]:
#Simplified Labels
from sklearn.linear_model import LogisticRegression
clf_simple = LogisticRegression(random_state=0, solver='lbfgs',
                         multi_class='multinomial').fit(X_train_simple, y_train_simple)



In [24]:
from sklearn.linear_model import LinearRegression
reg = LinearRegression().fit(X_train_simple, y_train_simple)

In [25]:
print(clf_simple.score(X_train_simple, y_train_simple))
print(clf_simple.score(X_test_simple, y_test_simple))
results_simple = clf_simple.predict(X_train_simple)
print(results_simple)
results_lin = reg.predict(X_train_simple)
print(results_lin)

0.8993047508690614
0.3986791797010775
[1 0 0 ... 1 0 0]
[ 7.56099609e-01 -4.46279214e-06  2.70565099e-06 ...  1.00000478e+00
 -5.67093891e-06 -3.11068651e-06]


In [26]:
#Make input for second model
new_input = []
print(simple_train_b.party[0])
print(simple_label_c[47])
for i, row in enumerate(results_simple):
    #print(simple_label_b[i])
    new_input.append((row, simple_train_b.party[i], simple_label_c[i]))
    #print(i)
#print(new_input)

0    1.0
0    1.0
Name: party, dtype: float64
0.13157894736842105


In [27]:
print(len(y_train_simple))
print(y_train_simple[0])
print(len(new_input))
reg2 = LinearRegression().fit(new_input[0], y_train_simple[0])

8630
2
8630


ValueError: Expected 2D array, got 1D array instead:
array=[1 0    1.0
0    1.0
Name: party, dtype: float64 0.5].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [None]:
pipe = make_pipeline(CountVectorizer(), LogisticRegression())
pipe.fit(X, train.label)

In [None]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
y_pred = gnb.fit(X_train.toarray(), y_train).predict(X_train.toarray())
print(y_pred)

In [None]:
print("Number of mislabeled points out of a total %d points : %d"
      % (X_train.shape[0],(y_train != y_pred).sum()))

In [None]:
gnb2 = GaussianNB()
y_pred = gnb2.fit(X_train.toarray(), y_train).predict(X_test.toarray())

In [None]:
print("Number of mislabeled points out of a total %d points : %d"
      % (X_test.shape[0],(y_test != y_pred).sum()))

In [28]:
train.head(5)

Unnamed: 0,ID,label,statement,subject,speaker,speaker's job,state info,party,barely true counts,false counts,half true counts,mostly true counts,pants on fire counts,venue
0,2635.json,FALSE,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer
1,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.
2,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver
3,1123.json,FALSE,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release
4,9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on CNN


In [40]:
def append_lying_ratio(df):
    """
    creates a weighted average of truth history from 0 to 1 per statement
    appends it to the train df as 'lying ratio'
    the value ranges from 0 to 1 
    where 0 is they always tell truth and 1 is they always lie
    if they don't have any history set as a random number (will be offset by 'truth history' which will be set to 0)
    
    TODO: subtract current label since included in counts (recommended in LIAR paper)
    """
    
    ratio = []
    
    fire = df['pants on fire counts'] 
    false = df['false counts'] 
    barely = df['barely true counts']
    half = df['half true counts'] 
    mostly = df['mostly true counts'] 
    
    FIRE_W = 1
    FALSE_W = .8
    BARELY_W = .6
    HALF_W = .4
    MOSTLY_W = .2
    
    RANDOM = .5 #if no history set to this value
    
    for i in range(len(train)):
        avg = 0
        avg += fire[i] * FIRE_W
        avg += false[i] * FALSE_W
        avg += barely[i] * BARELY_W
        avg += half[i] * HALF_W
        avg += mostly[i] * MOSTLY_W
        
        total = fire[i] + false[i] + barely[i] + half[i] + mostly[i]
        
        if total == 0:
            ratio.append(RANDOM)
        else:
            avg /= total
            ratio.append(avg)
        
    df['lying ratio'] = ratio
    
append_lying_ratio(train)

In [41]:
def append_ratio_significance(df):
    """
    sets significance as a value between 0 and 1
    appends it to df as 'ratio significance'
    """
    sig =  []
    
    fire = df['pants on fire counts'] 
    false = df['false counts'] 
    barely = df['barely true counts']
    half = df['half true counts'] 
    mostly = df['mostly true counts']
    
    CUTOFF = 100
    
    for i in range(len(train)):
        total = fire[i] + false[i] + barely[i] + half[i] + mostly[i]
        if total > CUTOFF:
            sig.append(1)
        else:
            sig.append(total / CUTOFF)
            
    df['ratio significance'] = sig
    
append_ratio_significance(train)

In [42]:
train.head(5)

Unnamed: 0,ID,label,statement,subject,speaker,speaker's job,state info,party,barely true counts,false counts,half true counts,mostly true counts,pants on fire counts,venue,lying ratio,ratio significance
0,2635.json,0,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,1.0,0.0,1.0,0.0,0.0,0.0,a mailer,0.8,0.01
1,10540.json,1,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,0.0,0.0,0.0,1.0,1.0,0.0,a floor speech.,0.3,0.02
2,324.json,2,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,0.0,70.0,71.0,160.0,163.0,9.0,Denver,0.432135,1.0
3,1123.json,0,Health care reform legislation is likely to ma...,health-care,blog-posting,,,0.5,7.0,19.0,3.0,5.0,44.0,a news release,0.841026,0.78
4,9028.json,1,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,0.0,15.0,9.0,20.0,19.0,2.0,an interview on CNN,0.461538,0.65


In [43]:
import nltk
nltk.download('opinion_lexicon')

[nltk_data] Downloading package opinion_lexicon to
[nltk_data]     /Users/mauriciow/nltk_data...
[nltk_data]   Package opinion_lexicon is already up-to-date!


True

In [54]:
def possessive_pronouns(sentence):
    #need to parse things like "theyre"
    l = ["i", "me", "my", "mine", "we", "us", "our", "ours"] 
    count = 0
    for w in sentence.split(' '):
        if w in l:
            count += 1
    return count

def negations(sentence):
    l = ["no", "not", "neither", "never", "no one", "nobody", "none", "nor", "nothing", "nowhere"]
    count = 0
    for w in sentence.split(' '):
        if w in l:
            count += 1
    return count

def cognitive_complexity(sentence):
    l = ["than", "rather than", "whether", "as much as", "whereas", "though", "although", "even though", "while", "if", "only if",
    "unless", "until", "providing that", "assuming that", "even if", "in case", "in case that", "lest"]
    count = 0
    for w in l:
        count += sentence.count(w)
    return count

In [55]:
def append_linguistic(df, f, name):
    counts = []
    for s in df.statement:
        counts.append(f(s))
    df[name] = counts
    
append_linguistic(train, possessive_pronouns, 'possessive')
append_linguistic(train, negations, 'negations')
append_linguistic(train, cognitive_complexity, 'complexity')



Unnamed: 0,ID,label,statement,subject,speaker,speaker's job,state info,party,barely true counts,false counts,half true counts,mostly true counts,pants on fire counts,venue,lying ratio,ratio significance,possessive,negations,complexity
0,2635.json,0,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,1.0,0.0,1.0,0.0,0.0,0.0,a mailer,0.8,0.01,0,0,0
1,10540.json,1,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,0.0,0.0,0.0,1.0,1.0,0.0,a floor speech.,0.3,0.02,0,0,0
2,324.json,2,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,0.0,70.0,71.0,160.0,163.0,9.0,Denver,0.432135,1.0,0,0,0
3,1123.json,0,Health care reform legislation is likely to ma...,health-care,blog-posting,,,0.5,7.0,19.0,3.0,5.0,44.0,a news release,0.841026,0.78,0,0,0
4,9028.json,1,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,0.0,15.0,9.0,20.0,19.0,2.0,an interview on CNN,0.461538,0.65,1,0,0
5,12465.json,2,The Chicago Bears have had more starting quart...,education,robin-vos,Wisconsin Assembly speaker,Wisconsin,1.0,0.0,3.0,2.0,5.0,1.0,a an online opinion-piece,0.472727,0.11,0,0,1
6,2342.json,1,Jim Dunnam has not lived in the district he re...,candidates-biography,republican-party-texas,,Texas,1.0,3.0,1.0,1.0,3.0,1.0,a press release.,0.511111,0.09,0,1,0
7,153.json,1,I'm the only person on this stage who has work...,ethics,barack-obama,President,Illinois,0.0,70.0,71.0,160.0,163.0,9.0,"a Democratic debate in Philadelphia, Pa.",0.432135,1.0,0,0,0
8,5602.json,1,"However, it took $19.5 million in Oregon Lotte...",jobs,oregon-lottery,,,0.5,0.0,0.0,1.0,0.0,1.0,a website,0.7,0.02,0,0,1
9,9741.json,2,Says GOP primary opponents Glenn Grothman and ...,"energy,message-machine-2014,voting-record",duey-stroebel,State representative,Wisconsin,1.0,0.0,0.0,0.0,1.0,0.0,an online video,0.2,0.01,0,0,0


In [56]:
train.tail(2)

Unnamed: 0,ID,label,statement,subject,speaker,speaker's job,state info,party,barely true counts,false counts,half true counts,mostly true counts,pants on fire counts,venue,lying ratio,ratio significance,possessive,negations,complexity
10238,2253.json,0,On lifting the U.S. Cuban embargo and allowing...,"florida,foreign-policy",jeff-greene,,Florida,0.0,3.0,1.0,3.0,0.0,0.0,a televised debate on Miami's WPLG-10 against ...,0.542857,0.07,0,0,1
10239,1155.json,0,The Department of Veterans Affairs has a manua...,"health-care,veterans",michael-steele,chairman of the Republican National Committee,Maryland,1.0,0.0,1.0,1.0,0.0,2.0,a Fox News interview,0.8,0.04,1,0,0


In [61]:
train[['label', 'statement', 'speaker', 'venue', 'lying ratio','ratio significance', 'possessive', 'negations', 'complexity']].tail(2)

Unnamed: 0,label,statement,speaker,venue,lying ratio,ratio significance,possessive,negations,complexity
10238,0,On lifting the U.S. Cuban embargo and allowing...,jeff-greene,a televised debate on Miami's WPLG-10 against ...,0.542857,0.07,0,0,1
10239,0,The Department of Veterans Affairs has a manua...,michael-steele,a Fox News interview,0.8,0.04,1,0,0


In [45]:
positive = set(nltk.opinion_lexicon.positive())
negative = set(opinion_lexicon.negative())

def negative_sentiment(sentence):
    return Counter([w for w in sentence.split(' ') if w in positive]) - Counter([w for w in sentence.split(' ') if w in negative]) 
#use a sentiment treebank analysis on sentiment instead of using raw counts?

AttributeError: module 'nltk' has no attribute 'opinion_lexicon'

In [None]:
#add possesive pronoun counts to training data
pronoun_counts = []
for s in train.statement:
    pronoun_counts.append( possessive_pronouns(s) )
simple_train['pronouns'] = pronoun_counts

In [None]:
def cognitive_complexity(sentence):
    l = ["than", "rather than", "whether", "as much as", "whereas", "though", "although", "even though", "while", "if", "only if",
    "unless", "until", "providing that", "assuming that", "even if", "in case", "in case that", "lest"]
    for w in sentence.split(' '):
        if w in l:
            count += 1
    return count