In [3]:
import pandas as pd
from sklearn import preprocessing
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.pipeline import make_pipeline

In [4]:
HEADER = ["ID", "label", "statement", "subject", "speaker", "speaker's job", "state info", "party", "barely true counts", "false counts", "half true counts", "mostly true counts", "pants on fire counts", "venue"]
train = pd.read_csv('data/train.csv', header=None)
train.columns = HEADER

In [5]:
test = pd.read_csv('data/test.csv', header=None)
test.columns = HEADER

In [None]:
both = [train, test]
combined = pd.concat(both)
combined.shape

In [166]:
simple_train = combined.copy()

def simplify_labels(label):
    """True-ish is 2, kinda true is 1, false-ish is 0"""
    new_labels = []
    for l in label:
        if l == 'pants-fire' or l == 'FALSE':
            new_labels.append(0)
        elif l == "barely-true" or l == "half-true":
            new_labels.append(1)
        elif l == "mostly-true" or l == "TRUE":
            new_labels.append(2)
        else:
            assert False, "{} is not a normal label".format(l)
    return new_labels

simple_label = simplify_labels(combined.label)
df2 = pd.DataFrame(simple_label, columns=list('A'))
print(df2)
simple_train.label = simple_label
simple_train.append(df2)
simple_train.head(5)

of pandas will change to not sort by default.

To accept the future behavior, pass 'sort=False'.


  sort=sort)


Unnamed: 0,ID,label,statement,subject,speaker,speaker's job,state info,party,barely true counts,false counts,half true counts,mostly true counts,pants on fire counts,venue
0,2635.json,0,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer
1,10540.json,1,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.
2,324.json,2,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver
3,1123.json,0,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release
4,9028.json,1,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on CNN


In [134]:
simple_train_b = combined.copy()

def simplify_labels2(label):
    """Conservative Spectrum"""
    new_labels = []
    for l in label:
        if l == 'republican':
            new_labels.append(1)
        elif l == "democrat" :
            new_labels.append(0)
        else:
            new_labels.append(.5)
    return new_labels

simple_label_b = simplify_labels2(combined.party)
simple_train_b.party = simple_label_b
simple_train_b.head(5)

Unnamed: 0,ID,label,statement,subject,speaker,speaker's job,state info,party,barely true counts,false counts,half true counts,mostly true counts,pants on fire counts,venue
0,2635.json,FALSE,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,1.0,0.0,1.0,0.0,0.0,0.0,a mailer
1,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,0.0,0.0,0.0,1.0,1.0,0.0,a floor speech.
2,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,0.0,70.0,71.0,160.0,163.0,9.0,Denver
3,1123.json,FALSE,Health care reform legislation is likely to ma...,health-care,blog-posting,,,0.5,7.0,19.0,3.0,5.0,44.0,a news release
4,9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,0.0,15.0,9.0,20.0,19.0,2.0,an interview on CNN


In [135]:
simple_train_c = combined.copy()
#for index, row in combined.iterrows():
    #print(index)
    #print(row['mostly true counts'])
def simplify_labels3(df):
    """True-ish is 2, kinda true is 1, false-ish is 0"""
    new_labels = []
    for index, row in df.iterrows():
        ratio = (row['mostly true counts']+1) / ((row['barely true counts'] + row['false counts'] + row['pants on fire counts'])+1)
        new_labels.append(ratio)
    return new_labels
simple_label_c = simplify_labels3(combined)
#simple_train_c.subject = simple_label_b



In [136]:
y_onehot = pd.get_dummies(train.label)
raw_X = train.statement
raw_X_test = test.statement
raw_combined = combined.statement
simple_combined = simple_train.statement

In [137]:
vectorizer = CountVectorizer()
#X = vectorizer.fit_transform(raw_X)
#X_test = vectorizer.fit_transform(raw_X_test)
combined_data = vectorizer.fit_transform(raw_combined)
simple_data = vectorizer.fit_transform(simple_combined)
print(combined_data.shape)
print(raw_X.shape)
print(y_onehot.shape)
print(train.label.shape)
print(raw_X_test.shape)
print(simple_data.shape)

(11507, 12873)
(10240,)
(10240, 6)
(10240,)
(1267,)
(11507, 12873)


In [138]:
#Train Test Split
#Could use same X w/ different labels
import numpy as np
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
     combined_data, combined.label, test_size=0.25, random_state=42)
X_train_simple, X_test_simple, y_train_simple, y_test_simple = train_test_split(
     simple_data, simple_train.label, test_size=0.25, random_state=42)
print(X_train.shape, X_test.shape, y_train.shape, y_test.shape)

(8630, 12873) (2877, 12873) (8630,) (2877,)


In [139]:
#With 6 Labels
from sklearn.linear_model import LogisticRegression
clf = LogisticRegression(random_state=0, solver='lbfgs',
                         multi_class='multinomial').fit(X_train, y_train)



In [140]:
print(clf.score(X_train, y_train))
print(clf.score(X_test, y_test))
results = clf.predict(X_train)
print(results)

0.8909617612977984
0.22384428223844283
['half-true' 'FALSE' 'FALSE' ... 'barely-true' 'FALSE' 'FALSE']


In [141]:
#Simplified Labels
from sklearn.linear_model import LogisticRegression
clf_simple = LogisticRegression(random_state=0, solver='lbfgs',
                         multi_class='multinomial').fit(X_train_simple, y_train_simple)



In [142]:
from sklearn.linear_model import LinearRegression
reg = LinearRegression().fit(X_train_simple, y_train_simple)

In [143]:
print(clf_simple.score(X_train_simple, y_train_simple))
print(clf_simple.score(X_test_simple, y_test_simple))
results_simple = clf_simple.predict(X_train_simple)
print(results_simple)
results_lin = reg.predict(X_train_simple)
print(results_lin)

0.8993047508690614
0.3986791797010775
[1 0 0 ... 1 0 0]
[ 7.56099609e-01 -4.46279214e-06  2.70565099e-06 ...  1.00000478e+00
 -5.67093891e-06 -3.11068651e-06]


In [152]:
#Make input for second model
new_input = []
print(simple_train_b.party[0])
print(simple_label_c[47])
for i, row in enumerate(results_simple):
    #print(simple_label_b[i])
    new_input.append((row, simple_train_b.party[i], simple_label_c[i]))
    #print(i)
#print(new_input)

0    1.0
0    1.0
Name: party, dtype: float64
0.13157894736842105


In [149]:
print(len(y_train_simple))
print(y_train_simple[0])
print(len(new_input))
reg2 = LinearRegression().fit(new_input[0], y_train_simple[0])

8630
2
(1, 0    1.0
0    1.0
Name: party, dtype: float64, 0.5)
8630


ValueError: Expected 2D array, got 1D array instead:
array=[1 0    1.0
0    1.0
Name: party, dtype: float64 0.5].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [118]:
pipe = make_pipeline(CountVectorizer(), LogisticRegression())
pipe.fit(X, train.label)

NameError: name 'X' is not defined

In [58]:
from sklearn.naive_bayes import GaussianNB
gnb = GaussianNB()
y_pred = gnb.fit(X_train.toarray(), y_train).predict(X_train.toarray())
print(y_pred)

['pants-fire' 'FALSE' 'barely-true' ... 'barely-true' 'FALSE' 'FALSE']


In [121]:
print("Number of mislabeled points out of a total %d points : %d"
      % (X_train.shape[0],(y_train != y_pred).sum()))

Number of mislabeled points out of a total 8630 points : 2501


In [123]:
gnb2 = GaussianNB()
y_pred = gnb2.fit(X_train.toarray(), y_train).predict(X_test.toarray())

Number of mislabeled points out of a total 2877 points : 2343


In [None]:
print("Number of mislabeled points out of a total %d points : %d"
      % (X_test.shape[0],(y_test != y_pred).sum()))

In [6]:
train.head(5)

Unnamed: 0,ID,label,statement,subject,speaker,speaker's job,state info,party,barely true counts,false counts,half true counts,mostly true counts,pants on fire counts,venue
0,2635.json,FALSE,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer
1,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.
2,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver
3,1123.json,FALSE,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release
4,9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on CNN


In [27]:
def append_lying_ratio(df):
    """
    creates a weighted average of truth history from 0 to 1 per statement
    appends it to the train df as 'lying ratio'
    the value ranges from 0 to 1 
    where 0 is they always tell truth and 1 is they always lie
    if they don't have any history set as a random number (will be offset by 'truth history' which will be set to 0)
    
    TODO: subtract current label since included in counts (recommended in LIAR paper)
    """
    
    ratio = []
    
    fire = df['pants on fire counts'] 
    false = df['false counts'] 
    barely = df['barely true counts']
    half = df['half true counts'] 
    mostly = df['mostly true counts'] 
    
    FIRE_W = 1
    FALSE_W = .8
    BARELY_W = .6
    HALF_W = .4
    MOSTLY_W = .2
    
    RANDOM = .5 #if no history set to this value
    
    for i in range(len(train)):
        avg = 0
        avg += fire[i] * FIRE_W
        avg += false[i] * FALSE_W
        avg += barely[i] * BARELY_W
        avg += half[i] * HALF_W
        avg += mostly[i] * MOSTLY_W
        
        total = fire[i] + false[i] + barely[i] + half[i] + mostly[i]
        
        if total == 0:
            ratio.append(RANDOM)
        else:
            avg /= total
            ratio.append(avg)
        
    df['lying ratio'] = ratio
    
append_lying_ratio(train)

In [24]:
def append_ratio_significance(df):
    """
    sets significance as a value between 0 and 1
    appends it to df as 'ratio significance'
    """
    sig =  []
    
    fire = df['pants on fire counts'] 
    false = df['false counts'] 
    barely = df['barely true counts']
    half = df['half true counts'] 
    mostly = df['mostly true counts']
    
    CUTOFF = 100
    
    for i in range(len(train)):
        total = fire[i] + false[i] + barely[i] + half[i] + mostly[i]
        if total > CUTOFF:
            sig.append(1)
        else:
            sig.append(total / CUTOFF)
            
    df['ratio significance'] = sig
    
append_ratio_significance(train)

In [30]:
train.head(5)

Unnamed: 0,ID,label,statement,subject,speaker,speaker's job,state info,party,barely true counts,false counts,half true counts,mostly true counts,pants on fire counts,venue,lying ratio,ratio significance
0,2635.json,FALSE,Says the Annies List political group supports ...,abortion,dwayne-bohac,State representative,Texas,republican,0.0,1.0,0.0,0.0,0.0,a mailer,0.8,0.01
1,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.,0.3,0.02
2,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver,0.432135,1.0
3,1123.json,FALSE,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release,0.841026,0.78
4,9028.json,half-true,The economic turnaround started at the end of ...,"economy,jobs",charlie-crist,,Florida,democrat,15.0,9.0,20.0,19.0,2.0,an interview on CNN,0.461538,0.65
