In [8]:
import pandas as pd
from nltk.tokenize import word_tokenize
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer, TfidfTransformer
from sklearn.linear_model import SGDClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import f1_score, accuracy_score
from sklearn.pipeline import Pipeline

In [9]:
# Load the full training set. The CSV has no header row, so the two columns
# are named at read time: the class label first, then the article text.
train_df = pd.read_csv('raw_data/fulltrain.csv', header=None, names=['label', 'text'])

# Quick sanity check: first rows, then summary statistics of the numeric column.
print(train_df.head(), train_df.describe(), sep='\n')

   label                                               text
0      1  A little less than a decade ago, hockey fans w...
1      1  The writers of the HBO series The Sopranos too...
2      1  Despite claims from the TV news outlet to offe...
3      1  After receiving 'subpar' service and experienc...
4      1  After watching his beloved Seattle Mariners pr...
              label
count  48854.000000
mean       2.487432
std        1.110903
min        1.000000
25%        1.000000
50%        3.000000
75%        3.000000
max        4.000000


In [10]:
# Optional 2-way variant of the task: keep only labels 1 and 2 (satire and hoax).
# Uncomment both lines below to enable it; the balanced-test cell carries a
# matching commented-out filter that relies on the same `labels` list.
# labels = [1, 2]
# train_df = train_df[train_df['label'].isin(labels)]

# Summary statistics of the training frame (reflects the filter when it is enabled).
train_df.describe()

Unnamed: 0,label
count,48854.0
mean,2.487432
std,1.110903
min,1.0
25%,1.0
50%,3.0
75%,3.0
max,4.0


In [11]:
# Hold out part of the labelled data for validation (train_test_split's default
# proportions). random_state pins the shuffle so the split, and every metric
# computed from it, is identical on each Restart & Run All, in line with the
# seeded classifier (random_state=0) trained on it.
X_train, X_test, y_train, y_test = train_test_split(
    train_df['text'], train_df['label'], random_state=0
)

## Linear Support Vector Machine

In [12]:
# Text-classification pipeline:
#   vect  - CountVectorizer builds the token-count matrix, tokenising with
#           NLTK's word_tokenize and applying the built-in English stop-word list
#   tfidf - TfidfTransformer re-weights those counts (sublinear term frequency)
#   clf   - SGDClassifier with hinge loss and L2 penalty, i.e. a linear SVM
#           trained by SGD for a fixed 5 epochs (tol=None disables early stopping)
sgd = Pipeline(steps=[
    ('vect', CountVectorizer(stop_words='english', tokenizer=word_tokenize)),
    ('tfidf', TfidfTransformer(sublinear_tf=True)),
    ('clf', SGDClassifier(
        alpha=1e-3,
        loss='hinge',
        max_iter=5,
        penalty='l2',
        random_state=0,
        tol=None,
    )),
])

# Pipeline.fit returns the fitted pipeline, so prediction can be chained onto it.
y_pred = sgd.fit(X_train, y_train).predict(X_test)

# Score against the held-out split.
print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"F1 score: {f1_score(y_test, y_pred, average='macro')}")

Accuracy: 0.9375307024725724
F1 score: 0.9335375235028938


## balancedtest.csv Results

In [14]:
# Evaluate the already-fitted pipeline on the separate balanced test file.
# The file has no header row, so the columns are named at read time.
test_df = pd.read_csv('raw_data/balancedtest.csv', header=None, names=['label', 'text'])

# Enable together with the 2-way filter on the training data:
# test_df = test_df[test_df['label'].isin(labels)]

# NOTE: this rebinds X_test / y_test / y_pred from the validation split above.
X_test, y_test = test_df['text'], test_df['label']
y_pred = sgd.predict(X_test)

print(f"Accuracy: {accuracy_score(y_test, y_pred)}")
print(f"F1 score: {f1_score(y_test, y_pred, average='macro')}")

Accuracy: 0.6113333333333333
F1 score: 0.6025226662241085
