# CS4248 Project - Labelled Unreliable News (LUN)

In [69]:
import nltk
import numpy as np
import pandas as pd
import string

from nltk.corpus import stopwords
from nltk.stem import PorterStemmer, WordNetLemmatizer
from nltk.tokenize import word_tokenize
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import f1_score, classification_report, accuracy_score
from sklearn.model_selection import train_test_split


# Fetch every NLTK resource this notebook relies on, not just WordNet:
# `stopwords.words` needs the 'stopwords' corpus and `word_tokenize` needs the
# Punkt models ('punkt' on older NLTK releases, 'punkt_tab' on newer ones).
# Without these a fresh environment fails with LookupError.
for nltk_resource in ('wordnet', 'stopwords', 'punkt', 'punkt_tab'):
    nltk.download(nltk_resource)

stop_words = set(stopwords.words('english'))

# Token normalisers used as the default arguments of `tokenize`.
# Set either one to None to disable it; to enable stemming instead, use
# `stemmer = PorterStemmer()`.
stemmer = None
lemmatizer = WordNetLemmatizer()

TEST_SIZE = 0.1       # passed to train_test_split as test_size
SMOOTHING = 1.0       # NOTE(review): not referenced by any cell visible here
NGRAM_RANGE = (1, 1)  # passed to TfidfVectorizer as ngram_range

[nltk_data] Downloading package wordnet to /Users/allard/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


## Helper Functions

In [70]:
def preprocess(sentence, lower_case=True, remove_punctuation=True):
    """Normalise a raw string before tokenisation.

    Args:
        sentence: The text to normalise.
        lower_case: When true, convert the text to lower case.
        remove_punctuation: When true, delete every character listed in
            ``string.punctuation`` (ASCII punctuation only).

    Returns:
        The normalised string.
    """
    text = sentence.lower() if lower_case else sentence
    if not remove_punctuation:
        return text

    # Mapping a character to None in a translation table deletes it.
    punctuation_table = str.maketrans(dict.fromkeys(string.punctuation))
    return text.translate(punctuation_table)

In [71]:
def tokenize(sentence, stemmer=stemmer, lemmatizer=lemmatizer, remove_stop_words=False):
    """Split a sentence into tokens and optionally normalise them.

    The ``stemmer`` and ``lemmatizer`` defaults are bound when this function
    is defined, so re-run this cell after changing the module-level objects.

    Args:
        sentence: Text to tokenise with ``word_tokenize``.
        stemmer: Object exposing ``stem(token)``; skipped when falsy.
        lemmatizer: Object exposing ``lemmatize(token)``; skipped when falsy.
        remove_stop_words: When true, drop tokens found in ``stop_words``.

    Returns:
        A list of processed tokens.
    """
    words = word_tokenize(sentence)

    # Order matters: stop words are filtered first, then stemming, then
    # lemmatisation is applied to whatever the previous step produced.
    if remove_stop_words:
        words = list(filter(lambda word: word not in stop_words, words))
    if stemmer:
        words = list(map(stemmer.stem, words))
    if lemmatizer:
        words = list(map(lemmatizer.lemmatize, words))

    return words

## Load Data

In [72]:
# Load the full training CSV. The file is read with header=None, so the two
# columns are named explicitly afterwards.
# pandas is already imported as `pd` in the import cell at the top of the
# notebook, so it is not re-imported here.
full_train_df = pd.read_csv('raw_data/fulltrain.csv', header=None)
full_train_df.columns = ['label', 'text']
print(len(full_train_df))
full_train_df.head()

48854


Unnamed: 0,label,text
0,1,"A little less than a decade ago, hockey fans w..."
1,1,The writers of the HBO series The Sopranos too...
2,1,Despite claims from the TV news outlet to offe...
3,1,After receiving 'subpar' service and experienc...
4,1,After watching his beloved Seattle Mariners pr...


In [73]:
# Drop rows whose text repeats an earlier row, then keep only the rows whose
# label is 1 or 2 for the two-class experiment below.
train_df = full_train_df.drop_duplicates(subset='text')
label_mask = train_df['label'].isin([1, 2])
subset_df = train_df.loc[label_mask]

print(f"No. training samples (all classes): {len(train_df)}")
print(f"No. training samples (classes 1 and 2): {len(subset_df)}")

No. training samples (all classes): 48652
No. training samples (classes 1 and 2): 20850


## Training

In [74]:
# Hold out TEST_SIZE of the two-class subset, stratified on the label so both
# partitions keep the same class proportions; the fixed seed makes the split
# repeatable.
X, y = subset_df['text'].values, subset_df['label'].values
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=TEST_SIZE,
    stratify=y,
    random_state=42,
)
(X_train.shape, X_test.shape)

((18765,), (2085,))

In [75]:
# TF-IDF vectoriser wired to the notebook's own `preprocess` and `tokenize`
# helpers instead of scikit-learn's built-in string handling.
tfidf_vectorizer = TfidfVectorizer(
    ngram_range=NGRAM_RANGE,
    smooth_idf=True,
    preprocessor=preprocess,
    tokenizer=tokenize,
    # token_pattern is ignored when a tokenizer is supplied; passing None
    # makes that explicit.
    token_pattern=None,
)
# Uncomment for default TfidfVectorizer
# tfidf_vectorizer = TfidfVectorizer(ngram_range=NGRAM_RANGE, smooth_idf=True)

In [76]:
# Learn the vocabulary and IDF weights from the training texts only, and
# build the training feature matrix in the same pass.
X_train_tfidf = tfidf_vectorizer.fit_transform(X_train)

# Inspect the learned vocabulary: its size and the first 150 entries.
feature_names = tfidf_vectorizer.get_feature_names_out()
print(f"Vocabulary size: {len(feature_names)}")
vocabulary_preview = feature_names[:150]
print(vocabulary_preview)

Vocabulary size: 107168
['0' '00' '000' '00000000' '00000000001' '0000000001ounce' '000000003'
 '00000001' '0000001' '0000004' '000002' '000006' '000013s' '00003'
 '00004' '00005' '0001' '00010010101010101010101010101010' '00010052'
 '0003' '0004' '000567kln00067q' '001' '0010'
 '001000011011010101010101010101010101010010' '002' '003' '0035' '007'
 '0073735963' '00893' '009siam' '01' '010' '0100' '0100010' '0101010'
 '0110' '0115' '012' '014' '016' '017' '018' '01ers' '01oz' '02' '020'
 '021' '025' '03' '0302' '03130' '031mile' '03823' '038d' '03squaremile'
 '04' '042684425' '045' '047' '05' '0543' '0563' '0586' '05k' '06' '064'
 '07' '072' '075off' '078' '08' '085' '087centimeter' '08and' '08ounce'
 '08second' '09' '0900' '095400' '099' '0bama' '0bamas' '0for4' '0for5'
 '1' '10' '100' '1000' '10000' '100000' '1000000' '10000000000000000000'
 '100000aday' '10000acre' '10000aplate' '10000foot' '10000km' '10000m'
 '10000man' '10000name' '10000person' '10000squarefoot' '10000th'
 '10000ye

In [77]:
# Show the feature matrix dimensions followed by its printed representation,
# one per line.
print(X_train_tfidf.shape, X_train_tfidf, sep="\n")

(18765, 107168)
  (0, 67928)	0.04690778335524813
  (0, 21249)	0.1333386348980611
  (0, 43194)	0.08310713344794932
  (0, 32006)	0.10953515566893744
  (0, 8163)	0.02869976456157917
  (0, 98456)	0.08821041617924698
  (0, 104089)	0.040846358021028074
  (0, 34746)	0.05842618131561038
  (0, 68440)	0.04632865702469982
  (0, 88085)	0.09197556341767808
  (0, 27208)	0.14193047969217695
  (0, 5184)	0.017927628630699844
  (0, 104922)	0.024296311553436957
  (0, 56934)	0.09661311129422885
  (0, 87213)	0.07548098878276675
  (0, 69549)	0.03238846408598244
  (0, 76474)	0.16859099801900168
  (0, 102404)	0.025875827153799776
  (0, 96215)	0.03193805789665119
  (0, 75318)	0.04892331870205253
  (0, 78776)	0.07397951229643021
  (0, 7251)	0.03172672066517962
  (0, 95155)	0.04390861784696202
  (0, 86441)	0.10020673063337579
  (0, 43945)	0.06339187505337111
  :	:
  (18764, 105297)	0.04112567606935367
  (18764, 37608)	0.05238050553398173
  (18764, 104260)	0.023713954774334255
  (18764, 50475)	0.09118795726712614

In [78]:
# Fit logistic regression on the TF-IDF training features.
clf = LogisticRegression(random_state=0, max_iter=200)
clf.fit(X_train_tfidf, y_train)

# These scores are computed on the data the model was fitted on, so they
# measure fit to the training set rather than generalisation.
y_pred = clf.predict(X_train_tfidf)
train_accuracy = accuracy_score(y_train, y_pred)
train_macro_f1 = f1_score(y_train, y_pred, average='macro')
print(f"Accuracy: {train_accuracy}")
print(f"F1 score: {train_macro_f1}")

Accuracy: 0.9897148947508659
F1 score: 0.9883552691098243


## Testing

In [79]:
# Vectorise the held-out texts with the already-fitted vectoriser (transform
# only, no refitting) and score the classifier on them.
X_test_tfidf = tfidf_vectorizer.transform(X_test)
y_pred = clf.predict(X_test_tfidf)

test_accuracy = accuracy_score(y_test, y_pred)
test_macro_f1 = f1_score(y_test, y_pred, average='macro')
print(f"Accuracy: {test_accuracy}")
print(f"F1 score: {test_macro_f1}")

Accuracy: 0.9908872901678657
F1 score: 0.9896919824361684


## Validation

In [80]:
# Sanity check on two hand-written sentences with known intended labels.

# Label 1
satire_sentence = "If voting changed anything, they would make it illegal."

# Label 2
# NOTE(review): "parterning" is presumably a typo for "partnering"; it is left
# as-is because changing the input text could change the prediction.
hoax_sentence = "In a recent turn of events, Obama has declared that he will be joining the Republican Party, parterning with Donald Trump."

X_val = [satire_sentence, hoax_sentence]
# Use a dedicated name for these features so X_test_tfidf keeps matching
# X_test / y_test after this cell has run.
X_val_tfidf = tfidf_vectorizer.transform(X_val)
y_pred_val = clf.predict(X_val_tfidf)
y_pred_val


array([1, 2])