In [1]:
import pandas
import numpy
from sklearn.model_selection import train_test_split, KFold
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import PassiveAggressiveClassifier
from sklearn.metrics import accuracy_score, confusion_matrix, f1_score, recall_score, precision_score

In [2]:
# Number of cross-validation folds: passed to KFold as n_splits and used as
# the divisor when the per-fold accuracies are averaged.
number_of_folds = 2

In [3]:
# Read in the data.
# The path is relative, so it is resolved against the current working directory.
# Later cells read the 'title', 'text' and 'label' columns of this frame.
data_frame = pandas.read_csv('../data/news.csv')  # read_csv returns a pandas.DataFrame

In [4]:
# Sanity check: show the frame's dimensions and its first rows
print('Dimensions of data:', data_frame.shape)
print('First few items in data:', data_frame.head(), sep='\n')

Dimensions of data: (6335, 4)
First few items in data:
      id                                              title  \
0   8476                       You Can Smell Hillary’s Fear   
1  10294  Watch The Exact Moment Paul Ryan Committed Pol...   
2   3608        Kerry to go to Paris in gesture of sympathy   
3  10142  Bernie supporters on Twitter erupt in anger ag...   
4    875   The Battle of New York: Why This Primary Matters   

                                                text label  
0  Daniel Greenfield, a Shillman Journalism Fello...  FAKE  
1  Google Pinterest Digg Linkedin Reddit Stumbleu...  FAKE  
2  U.S. Secretary of State John F. Kerry said Mon...  REAL  
3  — Kaydee King (@KaydeeKing) November 9, 2016 T...  FAKE  
4  It's primary day in New York and front-runners...  REAL  


In [5]:
# Construct a new column of <title + text>, that'll be the combined input textual data.
# Missing titles/texts are replaced by '' first: concatenating a string with NaN
# yields NaN, and a NaN entry is not a valid document for the vectorizer later on.
data_frame['combined'] = (
    data_frame['title'].fillna('') + ' / ' + data_frame['text'].fillna('')
)
print(data_frame['combined'])

0       You Can Smell Hillary’s Fear / Daniel Greenfie...
1       Watch The Exact Moment Paul Ryan Committed Pol...
2       Kerry to go to Paris in gesture of sympathy / ...
3       Bernie supporters on Twitter erupt in anger ag...
4       The Battle of New York: Why This Primary Matte...
                              ...                        
6330    State Department says it can't find emails fro...
6331    The ‘P’ in PBS Should Stand for ‘Plutocratic’ ...
6332    Anti-Trump Protesters Are Tools of the Oligarc...
6333    In Ethiopia, Obama seeks progress on peace, se...
6334    Jeb Bush Is Suddenly Attacking Trump. Here's W...
Name: combined, Length: 6335, dtype: object


In [6]:
# Pull the input texts and the labels out of the DataFrame as numpy arrays.
# The fold loop indexes them with the integer index arrays produced by
# KFold.split(), which numpy arrays interpret as positions.
data_array, label_array = (
    numpy.array(data_frame[column_name]) for column_name in ('combined', 'label')
)

In [7]:
def learn_vocabulary(tfidf_vectorizer, data_train):
    """Fit the vectorizer on the training documents and vectorize them.

    Parameters:
        tfidf_vectorizer: object exposing fit_transform(); it is fitted in place.
        data_train: the training documents.

    Returns:
        A (tfidf_vectorizer, tfidf_train) pair: the same vectorizer object that
        was passed in, and whatever fit_transform() returned for data_train.
    """
    # After this call the vectorizer's vocabulary is fixed
    train_matrix = tfidf_vectorizer.fit_transform(data_train)
    return tfidf_vectorizer, train_matrix

In [8]:
def apply_tfidf(tfidf_vectorizer, data_test):
    """Vectorize the test documents with an already fitted vectorizer.

    Parameters:
        tfidf_vectorizer: object exposing transform(); it is not refitted here.
        data_test: the test documents.

    Returns:
        A (tfidf_vectorizer, tfidf_test) pair: the same vectorizer object that
        was passed in, and whatever transform() returned for data_test.
    """
    test_matrix = tfidf_vectorizer.transform(data_test)
    return tfidf_vectorizer, test_matrix

In [9]:
def initialize_classifier(max_iter=50, random_state=0):
    """Create the (untrained) PassiveAggressiveClassifier, our machine learning method.

    Documentation:
    https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.PassiveAggressiveClassifier.html

    Parameters:
        max_iter: maximum number of iterations, forwarded to the classifier
            (default 50, the value this notebook has always used).
        random_state: seed forwarded to the classifier so that repeated runs
            give the same model and therefore the same reported metrics.
            Pass None to get the previous, unseeded behaviour.

    Returns:
        A new, unfitted PassiveAggressiveClassifier.
    """
    classifier = PassiveAggressiveClassifier(
        max_iter=max_iter,  # Max iterations
        random_state=random_state)  # Fixed seed => reproducible results

    return classifier

In [10]:
def train_classifier(classifier, tfidf_train, label_train):
    """Fit the classifier on the training features and labels.

    Parameters:
        classifier: object exposing fit(); it is trained in place.
        tfidf_train: features of the training samples.
        label_train: labels of the training samples.

    Returns:
        The same classifier object that was passed in (the return value of
        fit() itself is not used).
    """
    classifier.fit(tfidf_train, label_train)
    return classifier

In [11]:
def apply_classifier(classifier, tfidf_test):
    """Predict labels for the test features.

    Parameters:
        classifier: object exposing predict().
        tfidf_test: features of the test samples.

    Returns:
        Whatever classifier.predict(tfidf_test) returns, i.e. the predicted labels.
    """
    return classifier.predict(tfidf_test)

In [12]:
def print_results(label_test, label_predicted):
    """Print the evaluation metrics of one fold and return the accuracy.

    Reports the overall accuracy, then precision, recall and F1, each computed
    once with 'REAL' and once with 'FAKE' as the positive label, and finally
    the confusion matrix with its labels ordered ['REAL', 'FAKE'].

    Parameters:
        label_test: true labels of the test samples.
        label_predicted: labels predicted for the same samples.

    Returns:
        The accuracy as computed by accuracy_score.
    """
    overall_accuracy = accuracy_score(label_test, label_predicted)

    # (printed row heading, scoring function), in reporting order
    per_label_metrics = (
        ('Precision REAL:', precision_score),
        ('Recall    REAL:', recall_score),
        ('F1        REAL:', f1_score),
    )

    # Compute every score before anything is printed
    report_rows = []
    for heading, score_function in per_label_metrics:
        score_real = score_function(label_test, label_predicted, pos_label='REAL')
        score_fake = score_function(label_test, label_predicted, pos_label='FAKE')
        report_rows.append((heading, score_real, 'FAKE:', score_fake))

    print()
    print('Accuracy:', overall_accuracy)
    for report_row in report_rows:
        print(*report_row)
    print()
    print('Confusion matrix:')
    print(confusion_matrix(label_test, label_predicted, labels=['REAL', 'FAKE']))

    return overall_accuracy

In [13]:
# Split the dataset into k train + test folds, then train and evaluate a fresh
# vectorizer + classifier on every fold and report the average accuracy.
kfold = KFold(n_splits=number_of_folds)
accuracy_accumulated = 0.0

# enumerate() numbers the folds from 1 for the printed headers
for fold, (train_index, test_index) in enumerate(kfold.split(data_array), start=1):
    # Beginning of one fold
    print('---------- fold', fold, '----------')
    print()

    # train_index / test_index are integer positions into the full arrays
    data_train, data_test = data_array[train_index], data_array[test_index]
    label_train, label_test = label_array[train_index], label_array[test_index]

    # To visualize which part of the data is train and which part is test
    print('Train:', train_index, ' --- Test:', test_index)

    # Initialize a TfidfVectorizer.
    # A new one is created per fold so the vocabulary is learned from this
    # fold's training data only.
    # Documentation:
    # https://scikit-learn.org/stable/modules/generated/sklearn.feature_extraction.text.TfidfVectorizer.html
    tfidf_vectorizer = TfidfVectorizer(
        stop_words='english',  # E.g. a, the, we, are, therefore, where
        max_df=0.7)  # Max document frequency

    tfidf_vectorizer, tfidf_train = learn_vocabulary(tfidf_vectorizer, data_train)
    tfidf_vectorizer, tfidf_test = apply_tfidf(tfidf_vectorizer, data_test)

    # Initialize a PassiveAggressiveClassifier
    classifier = initialize_classifier()

    # Train our classifier, i.e. train our model, i.e. run the machine learning method
    classifier = train_classifier(classifier, tfidf_train, label_train)

    # Apply classifier to test data, i.e. predict labels for test data
    label_predicted = apply_classifier(classifier, tfidf_test)

    # Print results and save the accuracy
    accuracy = print_results(label_test, label_predicted)
    accuracy_accumulated += accuracy

    # End of one fold
    print()

# Report average accuracy
accuracy_average = accuracy_accumulated / number_of_folds
print('Average accuracy:', accuracy_average)

---------- fold 1 ----------

Train: [3168 3169 3170 ... 6332 6333 6334]  --- Test: [   0    1    2 ... 3165 3166 3167]

Accuracy: 0.9327651515151515
Precision REAL: 0.9413298565840938 FAKE: 0.9247246022031824
Recall    REAL: 0.9215060625398851 FAKE: 0.943785134291068
F1        REAL: 0.9313124798452113 FAKE: 0.9341576506955177

Confusion matrix:
[[1444  123]
 [  90 1511]]

---------- fold 2 ----------

Train: [   0    1    2 ... 3165 3166 3167]  --- Test: [3168 3169 3170 ... 6332 6333 6334]

Accuracy: 0.9239027470792548
Precision REAL: 0.9288860918816866 FAKE: 0.9188846641318125
Recall    REAL: 0.9201995012468828 FAKE: 0.927703134996801
F1        REAL: 0.9245223927341059 FAKE: 0.9232728430436167

Confusion matrix:
[[1476  128]
 [ 113 1450]]

Average accuracy: 0.9283339492972031
