<a href="https://colab.research.google.com/github/felixsimard/comp551-p2/blob/main/P2_TextClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Part 2: Text Classification (20 points)**

In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import joblib
import re
from sklearn.feature_extraction.stop_words import ENGLISH_STOP_WORDS
from nltk.stem.snowball import SnowballStemmer

from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize
import nltk
nltk.download('punkt')

# Constants

contractions_dict = {"ain't": "are not","'s":" is","aren't": "are not"}
# Regular expression for finding contractions
contractions_re = re.compile('(%s)' % '|'.join(contractions_dict.keys()))



[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!


In [2]:
# Define dataset paths
fake_new_train_dir = r'fake_news/fake_news_train.csv'
fake_news_val_dir = r'fake_news/fake_news_val.csv'
fake_news_test_dir = r'fake_news/fake_news_test.csv'

# Load datasets
fake_news_train = pd.read_csv(fake_new_train_dir, engine="python", error_bad_lines=False)
fake_news_val = pd.read_csv(fake_news_val_dir, engine="python", error_bad_lines=False)
fake_news_test = pd.read_csv(fake_news_test_dir, engine="python", error_bad_lines=False)

fake_news_train

Unnamed: 0,text,label
0,Indian fruit is so important to so many people...,0
1,"FORT WORTH, Texas — Urú Inc. will hold a confe...",0
2,"With three of the four new carriers, the Niger...",0
3,Let's start with the classic annual dividend r...,0
4,Following are some of the major events to have...,1
...,...,...
19995,"Warning: small, petty spoilers for the Game of...",1
19996,Shilpa Shetty will soon make her Bollywood deb...,0
19997,Add a digital black hole image to the Allstate...,0
19998,Share\nThe name W. L. Gore & Associates might ...,1


#### Preprocessing

In [9]:
# Stemmer function
def apply_stemmer(text):
    # stemmer= PorterStemmer()
    stemmer = SnowballStemmer("english")
    input_str = word_tokenize(text)
    stem_lst = []
    for word in input_str:
        word_stem = stemmer.stem(word)
        stem_lst.append(word_stem)
    
    stem_str = " ".join(stem_lst)
    return stem_str

# Expanding contractions
# Reference: https://www.analyticsvidhya.com/blog/2021/06/must-known-techniques-for-text-preprocessing-in-nlp/
def expand_contractions(text,contractions_dict=contractions_dict):
    def replace(match):
        return contractions_dict[match.group(0)]
    return contractions_re.sub(replace, text)

# Preprocessing function
def preprocess(df):

    # Lowercase everything
    df['text'] = df['text'].str.lower()

    # Expand contractions
    # Reference: https://www.analyticsvidhya.com/blog/2021/06/must-known-techniques-for-text-preprocessing-in-nlp/
    df['text'] = df['text'].apply(lambda x : expand_contractions(x))

    # Remove ponctuation
    df['text'] = df['text'].replace(r'[^\w\s]', r'', regex=True)

    # Remove numbers
    df['text'] = df['text'].replace(r'\d', r'', regex=True)

    # Remove special characters (eg: \n)
    df['text'] = df['text'].replace(r'\\[a-z]', r'', regex=True)

    # Stemming
    # df['text'] = df.apply(lambda row: apply_stemmer(row['text']), axis=1)

    # Trim whitespaces
    df['text'] = df['text'].str.strip()

    # Bag of words + Tokenization
    vect = CountVectorizer(stop_words=ENGLISH_STOP_WORDS)
    X_train_counts = vect.fit_transform(df.text)
    joblib.dump(vect, "vectorizer.pkl")
    print("Vectorizer vocabulary:", vect.vocabulary_.get(u'algorithm'))
    print("Count Vectorizer shape:", X_train_counts.shape)
    

    # TF-IDF representation using bag-of-words matrix
    tfidf_transform = TfidfTransformer()
    X_train_tfidf = tfidf_transform.fit_transform(X_train_counts)
    joblib.dump(tfidf_transform, "tfidf_transform.pkl")
    print("TF-IDF shape:", X_train_tfidf.shape)
    print("\n")

    return X_train_tfidf, df

fake_news_train_preprocessed_tfidf, fake_news_train_preprocessed = preprocess(fake_news_train)
fake_news_train_preprocessed

Vectorizer vocabulary: 4050
Count Vectorizer shape: (20000, 175918)
TF-IDF shape: (20000, 175918)




Unnamed: 0,text,label
0,indian fruit is so important to so many people...,0
1,fort worth texas urú inc will hold a conferen...,0
2,with three of the four new carriers the nigeri...,0
3,let is start with the classic annual dividend ...,0
4,following are some of the major events to have...,1
...,...,...
19995,warning small petty spoilers for the game of t...,1
19996,shilpa shetty will soon make her bollywood deb...,0
19997,add a digital black hole image to the allstate...,0
19998,share\nthe name w l gore associates might not...,1


In [10]:
fake_news_train_preprocessed.iloc[0]['text']

'indian fruit is so important to so many people now that the season has come to an end the first stop is to pick something fresh and tangy with some sugar\nchopped papaya and apple is a little more complicated and some good fats come in other sugar as well milk and almonds are two great sources of fats as well as a couple of fruits such as avocados and okra the result is such a healthy delicious appletoberry spread\neasy yummy and delicious\n oz apple or applelivered bon bons\n tbsp sugar\n tsp almond extract\nfreshly ground black pepper\n tbsp rice vinegar\n\n thick and lightflavored bon bons\nseason with salt and pepper to taste and vary thickness\nto finish add the fruit to the bowl of a small mixer fitted with a large paddle attachment line a bowl with nonstick cooking spray if using spatula for these meals add enough sugar and almond extract to shake off any excess\nwhen the mixture is thick and solid remove the bowl from the mixer with a slotted spoon and mix again\nworking with 

#### Model Training, Validation, Testing

In [11]:
def train_model(x_train, y_train):
    # preprocess x_train
    x_train_preprocessed_tfidf, x_train_preprocessed  = preprocess(x_train)

    # fit model
    model = LogisticRegression().fit(x_train_preprocessed_tfidf, y_train)
    training_score = model.score(x_train_preprocessed_tfidf, y_train)
    joblib.dump(train_model, "model.pkl")

    print("Training score:", training_score)

    return model

def transform_preprocess(x_test):
    # Load saved pickles
    loaded_vectorizer = joblib.load("vectorizer.pkl")
    loaded_tfidf_transform = joblib.load("tfidf_transform.pkl")
    loaded_model = joblib.load("model.pkl")

    # Transform
    x_val_vec = loaded_vectorizer.transform(x_test)
    x_val_tfidf = loaded_tfidf_transform.transform(x_val_vec)

    return x_val_tfidf


In [12]:
# Train model
trained_model = train_model(fake_news_train, fake_news_train['label'])

Vectorizer vocabulary: 4050
Count Vectorizer shape: (20000, 175918)
TF-IDF shape: (20000, 175918)


Training score: 0.85335


In [13]:
# Test set
x_test_tfidf = transform_preprocess(fake_news_test['text'])
y_predictions = trained_model.predict(x_test_tfidf)

acc_score = accuracy_score(fake_news_test['label'], y_predictions)
print("Accuracy on test set:", acc_score)

Accuracy on test set: 0.6746666666666666
