<a href="https://colab.research.google.com/github/felixsimard/comp551-p2/blob/main/P2_TextClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Part 2: Text Classification (20 points)**

In [7]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import joblib
import re
from nltk.corpus import stopwords
from nltk.stem import PorterStemmer
from nltk.tokenize import word_tokenize        
from nltk.stem import WordNetLemmatizer 
import nltk
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')

# Constants

contractions_dict = {"ain't": "are not","'s":" is","aren't": "are not"}
# Regular expression for finding contractions
contractions_re = re.compile('(%s)' % '|'.join(contractions_dict.keys()))

stop_words = set(stopwords.words('english'))
stop_words.add('subject')
stop_words.add('http')

[nltk_data] Downloading package punkt to /root/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to /root/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to /root/nltk_data...
[nltk_data]   Unzipping corpora/wordnet.zip.


In [8]:
# Define dataset paths
fake_new_train_dir = r'fake_news/fake_news_train.csv'
fake_news_val_dir = r'fake_news/fake_news_val.csv'
fake_news_test_dir = r'fake_news/fake_news_test.csv'

# Load datasets
fake_news_train = pd.read_csv(fake_new_train_dir, engine="python", error_bad_lines=False)
fake_news_val = pd.read_csv(fake_news_val_dir, engine="python", error_bad_lines=False)
fake_news_test = pd.read_csv(fake_news_test_dir, engine="python", error_bad_lines=False)

fake_news_train

Skipping line 5337: unexpected end of data


Unnamed: 0,text,label
0,Indian fruit is so important to so many people...,0
1,"FORT WORTH, Texas — Urú Inc. will hold a confe...",0
2,"With three of the four new carriers, the Niger...",0
3,Let's start with the classic annual dividend r...,0
4,Following are some of the major events to have...,1
...,...,...
5330,With the return of Game of Thrones also comes ...,1
5331,Waller-Bridge is heading the next installment ...,0
5332,"Share\nA number of retailers are offering $50,...",0
5333,"TEHRAN — Around 4,000 couples attended 34 such...",0


#### Preprocessing

In [9]:
# Stemmer function
def apply_stemmer(text):
    stemmer= PorterStemmer()
    return " ".join([stemmer.stem(word) for word in text.split()])

# Expanding contractions
# Reference: https://www.analyticsvidhya.com/blog/2021/06/must-known-techniques-for-text-preprocessing-in-nlp/
def expand_contractions(text,contractions_dict=contractions_dict):
    def replace(match):
        return contractions_dict[match.group(0)]
    return contractions_re.sub(replace, text)

# Remove stop-words
# Reference: https://www.analyticsvidhya.com/blog/2021/06/must-known-techniques-for-text-preprocessing-in-nlp/
def remove_stopwords(text):
    return " ".join([word for word in str(text).split() if word not in stop_words])

class LemmaTokenizer(object):
    def __init__(self):
        self.wnl = WordNetLemmatizer()
    def __call__(self, articles):
        return [self.wnl.lemmatize(t) for t in word_tokenize(articles)]
        
# Preprocessing function
def preprocess(df):

    # Lowercase everything
    df['text'] = df['text'].str.lower()

    # Expand contractions
    # Reference: https://www.analyticsvidhya.com/blog/2021/06/must-known-techniques-for-text-preprocessing-in-nlp/
    df['text'] = df['text'].apply(lambda x : expand_contractions(x))

    # Remove ponctuation
    df['text'] = df['text'].replace(r'[^\w\s]', r'', regex=True)

    # Remove numbers
    df['text'] = df['text'].replace(r'\d', r'', regex=True)

    # Remove special characters (eg: \n)
    df['text'] = df['text'].replace(r'\\[a-z]', r'', regex=True)

    # Remove stop words
    df['text'] = df['text'].apply(lambda x : remove_stopwords(x))

    # Stemming
    # df["text"] = df["text"].apply(lambda x: apply_stemmer(x))

    # Trim whitespaces
    df['text'] = df['text'].str.strip()

    # Word level count vectorization
    vect = CountVectorizer(tokenizer=LemmaTokenizer(), analyzer='word', token_pattern=r'\w{1,}', min_df=5, max_df=0.5, stop_words='english', max_features=5000, binary=True)
    X_train_counts = vect.fit_transform(df.text)
    joblib.dump(vect, "vectorizer.pkl")
    print("Vectorizer vocabulary:", vect.vocabulary_.get(u'algorithm'))
    print("Count Vectorizer shape:", X_train_counts.shape)
    

    # TF-IDF representation using bag-of-words matrix
    tfidf_transform = TfidfTransformer()
    X_train_tfidf = tfidf_transform.fit_transform(X_train_counts)
    joblib.dump(tfidf_transform, "tfidf_transform.pkl")
    print("TF-IDF shape:", X_train_tfidf.shape)
    print("\n")

    return X_train_tfidf, df

# fake_news_train_preprocessed_tfidf, fake_news_train_preprocessed = preprocess(fake_news_train)
# fake_news_train_preprocessed

In [10]:
# fake_news_train_preprocessed.iloc[0]['text']

#### Model Training, Validation, Testing

In [11]:
def train_model(x_train, y_train):
    # preprocess x_train
    x_train_preprocessed_tfidf, x_train_preprocessed  = preprocess(x_train)

    # fit model
    model = LogisticRegression(C=5, fit_intercept=False, solver='liblinear', multi_class='ovr').fit(x_train_preprocessed_tfidf, y_train)
    training_score = model.score(x_train_preprocessed_tfidf, y_train)
    joblib.dump(train_model, "model.pkl")

    print("Training score:", training_score)

    return model

def transform_preprocess(x_test):
    # Load saved pickles
    loaded_vectorizer = joblib.load("vectorizer.pkl")
    loaded_tfidf_transform = joblib.load("tfidf_transform.pkl")
    loaded_model = joblib.load("model.pkl")

    # Transform
    x_val_vec = loaded_vectorizer.transform(x_test)
    x_val_tfidf = loaded_tfidf_transform.transform(x_val_vec)

    return x_val_tfidf


In [12]:
# Train model
trained_model = train_model(fake_news_train, fake_news_train['label'])

  'stop_words.' % sorted(inconsistent))


Vectorizer vocabulary: 149
Count Vectorizer shape: (5335, 5000)
TF-IDF shape: (5335, 5000)


Training score: 0.9439550140581069


In [13]:
# Test set
x_test_tfidf = transform_preprocess(fake_news_test['text'])
y_predictions = trained_model.predict(x_test_tfidf)

acc_score = accuracy_score(fake_news_test['label'], y_predictions)
print("Accuracy on test set:", acc_score)

  'stop_words.' % sorted(inconsistent))


Accuracy on test set: 0.6426666666666667
