In [1]:
import numpy as np
import pandas as pd
from nltk.tokenize import WhitespaceTokenizer
from nltk.stem import PorterStemmer
from sklearn.preprocessing import LabelEncoder, OneHotEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import cross_val_score, GridSearchCV

from keras import optimizers
from keras.models import Sequential
from keras.layers import Dropout, Dense, Conv1D, MaxPooling1D, Flatten
from keras.wrappers.scikit_learn import KerasClassifier

Using TensorFlow backend.


# Method to concat all datasets together 

In [2]:
def concat_datasets(train, test, validation):
    COLUMNS_LABELS = ['ID', 'Label', 'Statement', 'Subject', 'Speaker', "Speaker's job",
                     'State info', 'Party affiliation', 'Barely true counts', 'False counts',
                     'Half true counts', 'Mostly true counts', 'Pants on fire counts', 'Venue']
    
    train.columns = test.columns = validation.columns = COLUMNS_LABELS
    
    return pd.concat([train, test, validation])

In [3]:
concat_datasets(pd.read_csv('datasets/train.tsv', sep='\t'),
                pd.read_csv('datasets/test.tsv', sep='\t'),
                pd.read_csv('datasets/valid.tsv', sep='\t')).head(3)

Unnamed: 0,ID,Label,Statement,Subject,Speaker,Speaker's job,State info,Party affiliation,Barely true counts,False counts,Half true counts,Mostly true counts,Pants on fire counts,Venue
0,10540.json,half-true,When did the decline of coal start? It started...,"energy,history,job-accomplishments",scott-surovell,State delegate,Virginia,democrat,0.0,0.0,1.0,1.0,0.0,a floor speech.
1,324.json,mostly-true,"Hillary Clinton agrees with John McCain ""by vo...",foreign-policy,barack-obama,President,Illinois,democrat,70.0,71.0,160.0,163.0,9.0,Denver
2,1123.json,false,Health care reform legislation is likely to ma...,health-care,blog-posting,,,none,7.0,19.0,3.0,5.0,44.0,a news release


# Transforming labels into 'true' or 'false'

In [4]:
def simplify_labels(label):
    true_labels = ['half-true', 'mostly-true', 'true']
    
    return 'true' if label in true_labels else 'false'

In [5]:
df = concat_datasets(pd.read_csv('datasets/train.tsv', sep='\t'),
                pd.read_csv('datasets/test.tsv', sep='\t'),
                pd.read_csv('datasets/valid.tsv', sep='\t'))

In [6]:
df['Label'].head()

0      half-true
1    mostly-true
2          false
3      half-true
4           true
Name: Label, dtype: object

In [7]:
df['Label'].apply(simplify_labels).head()

0     true
1     true
2    false
3     true
4     true
Name: Label, dtype: object

# Splitting dataframe into X and y axis

In [8]:
def split_dataframe(df):
    df['Label'] = df['Label'].apply(simplify_labels)
    
    X = df.iloc[:, 2].values
    y = df.iloc[:, 1].values
    
    return X, y

In [9]:
df = concat_datasets(pd.read_csv('datasets/train.tsv', sep='\t'),
                pd.read_csv('datasets/test.tsv', sep='\t'),
                pd.read_csv('datasets/valid.tsv', sep='\t'))

In [10]:
X, y = split_dataframe(df)

In [11]:
X

array(['When did the decline of coal start? It started when natural gas took off that started to begin in (President George W.) Bushs administration.',
       'Hillary Clinton agrees with John McCain "by voting to give George Bush the benefit of the doubt on Iran."',
       'Health care reform legislation is likely to mandate free sex change surgeries.',
       ...,
       'John McCain and George Bush have "absolutely no plan for universal health care."',
       "A new poll shows 62 percent support the president's plan to reform health care. That means ... letting you choose between keeping the private insurance you have and a public health insurance plan.",
       'No one claims the report vindicating New Jersey Gov. Chris Christie in the bridge scandal is conclusive.'],
      dtype=object)

In [12]:
y

array(['true', 'true', 'false', ..., 'true', 'false', 'false'],
      dtype=object)

# Splitting X and y axis into train and test data

In [13]:
df = concat_datasets(pd.read_csv('datasets/train.tsv', sep='\t'),
                pd.read_csv('datasets/test.tsv', sep='\t'),
                pd.read_csv('datasets/valid.tsv', sep='\t'))
X, y = split_dataframe(df)

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)

In [15]:
X_train

array(['We havent had a Republican senator in Washington for ... why, I think Clifford Case was our last Republican senator.',
       "Obama's Ten Point Plan to 'Change' The Second Amendment.Ban the manufacture, sale and possession of handguns.",
       'The House of Representatives has never sued a sitting president in all of U.S. history.',
       ...,
       "If you have an investment for your child's education or own a mutual fund or a stock in a retirement plan, (Obama) is going to raise your taxes.",
       'Republicans are attempting to remove Barack Obama from Georgias Presidential Ballot in 2012.',
       "President Obama's own director of national intelligence, Admiral Blair, put it this way: 'High-value information came from interrogations in which those methods were used and provided a deeper understanding of the al-Qaida organization that was attacking this country.'"],
      dtype=object)

In [16]:
y_train

array(['true', 'false', 'true', ..., 'false', 'true', 'true'],
      dtype=object)

# Stemming text

In [17]:
def stemming_documents(documents):
    whitespace_tokenizer = WhitespaceTokenizer()
    stemmer = PorterStemmer()
    stemmed_documents = []
    
    for document in documents:
        sentence = ' '.join([stemmer.stem(word.lower()) for word in whitespace_tokenizer.tokenize(document)])
        stemmed_documents.append(sentence)
    
    return np.array(stemmed_documents, dtype='object')

In [18]:
df = concat_datasets(pd.read_csv('datasets/train.tsv', sep='\t'),
                pd.read_csv('datasets/test.tsv', sep='\t'),
                pd.read_csv('datasets/valid.tsv', sep='\t'))
X, y = split_dataframe(df)

In [19]:
X[0]

'When did the decline of coal start? It started when natural gas took off that started to begin in (President George W.) Bushs administration.'

In [20]:
stemming_documents(X)[0]

'when did the declin of coal start? it start when natur ga took off that start to begin in (presid georg w.) bush administration.'

# Encoding label data

In [21]:
def encode_categorical_data(labels):
    return LabelEncoder().fit_transform(labels)

In [22]:
df = concat_datasets(pd.read_csv('datasets/train.tsv', sep='\t'),
                pd.read_csv('datasets/test.tsv', sep='\t'),
                pd.read_csv('datasets/valid.tsv', sep='\t'))
X, y = split_dataframe(df)

In [23]:
y

array(['true', 'true', 'false', ..., 'true', 'false', 'false'],
      dtype=object)

In [24]:
encode_categorical_data(y)

array([1, 1, 0, ..., 1, 0, 0])

# Preprocessing data 

In [25]:
def preprocess_data(X, y, dataset_type, tf_idf):    
    X = stemming_documents(X)

    if dataset_type == 'train':
        X = tf_idf.fit_transform(X).toarray()
    elif dataset_type == 'test' or dataset_type == 'validation':
        X = tf_idf.transform(X).toarray()
           
    y = encode_categorical_data(y)
    y = y.reshape(-1, 1)

    return X, y

In [26]:
df = concat_datasets(pd.read_csv('datasets/train.tsv', sep='\t'),
                pd.read_csv('datasets/test.tsv', sep='\t'),
                pd.read_csv('datasets/valid.tsv', sep='\t'))
X, y = split_dataframe(df)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
tf_idf = TfidfVectorizer(max_df=0.5)

In [27]:
X_train[0]

'We havent had a Republican senator in Washington for ... why, I think Clifford Case was our last Republican senator.'

In [28]:
y_train[0]

'true'

In [29]:
X_train, y_train = preprocess_data(X_train, y_train, 'train', tf_idf)
X_test, y_test = preprocess_data(X_test, y_test, 'test', tf_idf)

In [34]:
X_train[0]

array([0., 0., 0., ..., 0., 0., 0.])

In [39]:
y_train[0]

array([1])

# Creating Neural Network model 

In [48]:
def build_ann_classifier(rate=0.1, lr=0.01):
    classifier = Sequential()
    
    classifier.add(Dense(units=512, kernel_initializer='uniform', activation='relu', input_shape=(10229,)))
    classifier.add(Dropout(rate=rate))
    classifier.add(Dense(units=512, kernel_initializer='uniform', activation='relu'))
    classifier.add(Dropout(rate=rate))
    classifier.add(Dense(units=256, kernel_initializer='uniform', activation='relu'))
    classifier.add(Dropout(rate=rate))
    classifier.add(Dense(units=128, kernel_initializer='uniform', activation='relu'))
    classifier.add(Dropout(rate=rate))
    classifier.add(Dense(units=64, kernel_initializer='uniform', activation='relu'))
    classifier.add(Dense(units=32, kernel_initializer='uniform', activation='relu'))
    classifier.add(Dense(units=1, kernel_initializer='uniform', activation='sigmoid'))

#    adam = optimizers.Adam(lr=lr)
#    classifier.compile(optimizer=adam, loss='binary_crossentropy', metrics=['accuracy'])

    rmsprop = optimizers.RMSprop(lr=lr)
    classifier.compile(optimizer=rmsprop, loss='binary_crossentropy', metrics=['accuracy'])

    return classifier

In [40]:
df = concat_datasets(pd.read_csv('datasets/train.tsv', sep='\t'),
                pd.read_csv('datasets/test.tsv', sep='\t'),
                pd.read_csv('datasets/valid.tsv', sep='\t'))
X, y = split_dataframe(df)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
tf_idf = TfidfVectorizer(max_df=0.5)

X_train, y_train = preprocess_data(X_train, y_train, 'train', tf_idf)
X_test, y_test = preprocess_data(X_test, y_test, 'test', tf_idf)

In [51]:
classifier = build_ann_classifier()
classifier.fit(X_train, y_train, batch_size=50, epochs=3, shuffle=True, validation_split=0.25)

Train on 7193 samples, validate on 2398 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7fee05f0beb8>

# Predict a single news 

In [57]:
def predict_news(news):
    news = np.array(news)
    news = tf_idf.transform(news).toarray()
    
    return classifier.predict(news) 

In [60]:
df = concat_datasets(pd.read_csv('datasets/train.tsv', sep='\t'),
                pd.read_csv('datasets/test.tsv', sep='\t'),
                pd.read_csv('datasets/valid.tsv', sep='\t'))
X, y = split_dataframe(df)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
tf_idf = TfidfVectorizer(max_df=0.5)

news_test = X_test[0]
news_test_label = y_test[0]

X_train, y_train = preprocess_data(X_train, y_train, 'train', tf_idf)
X_test, y_test = preprocess_data(X_test, y_test, 'test', tf_idf)

classifier = build_ann_classifier()
classifier.fit(X_train, y_train, batch_size=50, epochs=3, shuffle=False, validation_split=0.25)

Train on 7193 samples, validate on 2398 samples
Epoch 1/3
Epoch 2/3
Epoch 3/3


<keras.callbacks.History at 0x7feeb5c0a908>

In [63]:
news_test

'Barack Obama wrote a thesis at Columbia University in which he criticized "plutocratic thugs" and said the Constitution gave Americans "the shackles of hypocrisy."'

In [65]:
news_test_label

'false'

In [67]:
predict_news([news_test])

array([[0.56807]], dtype=float32)

# Doing k-fold cross validation

In [69]:
def k_fold_cross_validation(X, y, build_fn, k, batch, epochs):
    classifier = KerasClassifier(build_fn=build_fn, batch_size=batch, epochs=epochs)
    accuracies = cross_val_score(estimator=classifier, X=X, y=y, cv=k)
    mean = accuracies.mean()
    variance = accuracies.std()
    
    return mean, variance

In [70]:
df = concat_datasets(pd.read_csv('datasets/train.tsv', sep='\t'),
                pd.read_csv('datasets/test.tsv', sep='\t'),
                pd.read_csv('datasets/valid.tsv', sep='\t'))
X, y = split_dataframe(df)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=0)
tf_idf = TfidfVectorizer(max_df=0.5)

X_train, y_train = preprocess_data(X_train, y_train, 'train', tf_idf)
X_test, y_test = preprocess_data(X_test, y_test, 'test', tf_idf)

In [71]:
accuracy, variance = k_fold_cross_validation(X_train, y_train, build_ann_classifier, 5, 50, 3)

Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3
Epoch 1/3
Epoch 2/3
Epoch 3/3


In [73]:
accuracy

0.6030652305147586

In [74]:
variance

0.008634168876144394