<a href="https://colab.research.google.com/github/felixsimard/comp551-p2/blob/main/P2_TextClassification.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## **Part 2: Text Classification (20 points)**

In [85]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer
import joblib

# Constants

In [86]:
# Define dataset paths
fake_new_train_dir = r'fake_news/fake_news_train.csv'
fake_news_val_dir = r'fake_news/fake_news_val.csv'
fake_news_test_dir = r'fake_news/fake_news_test.csv'

# Load datasets
fake_news_train = pd.read_csv(fake_new_train_dir, engine="python", error_bad_lines=False)
fake_news_val = pd.read_csv(fake_news_val_dir, engine="python", error_bad_lines=False)
fake_news_test = pd.read_csv(fake_news_test_dir, engine="python", error_bad_lines=False)

fake_news_train

Skipping line 3994: unexpected end of data


Unnamed: 0,text,label
0,Indian fruit is so important to so many people...,0
1,"FORT WORTH, Texas — Urú Inc. will hold a confe...",0
2,"With three of the four new carriers, the Niger...",0
3,Let's start with the classic annual dividend r...,0
4,Following are some of the major events to have...,1
...,...,...
3987,"sport, local-sport,\nMelbourne Victory is stre...",1
3988,LIONEL MESSI is still struggling after being h...,1
3989,"FRANKFURT, Finland — Finland’s economy has gro...",0
3990,Vermont health officials say two coyotes have ...,1


#### Preprocessing

In [87]:
# Make a preprocessing function
def preprocess(df):

    # Lowercase everything
    df['text'] = df['text'].str.lower()

    # Remove ponctuation
    df['text'] = df['text'].replace(r'[^\w\s]', r'', regex=True)

    # Trim whitespaces
    df['text'] = df['text'].str.strip()

    # Bag of words + Tokenization
    vect = CountVectorizer()
    X_train_counts = vect.fit_transform(df.text)
    joblib.dump(vect, "vectorizer.pkl")
    print("Vectorizer vocabulary:", vect.vocabulary_.get(u'algorithm'))
    print("Count Vectorizer shape:", X_train_counts.shape)
    

    # TF-IDF repsenation using bag-of-words matrix
    tfidf_transform = TfidfTransformer()
    X_train_tfidf = tfidf_transform.fit_transform(X_train_counts)
    joblib.dump(tfidf_transform, "tfidf_transform.pkl")
    print("TF-IDF shape:", X_train_tfidf.shape)
    print("\n")

    return X_train_tfidf



#### Model Training, Validation, Testing

In [88]:
def train_model(x_train, y_train):
    # preprocess x_train
    x_train_preprocessed = preprocess(x_train)

    # fit model
    model = LogisticRegression().fit(x_train_preprocessed, y_train)
    training_score = model.score(x_train_preprocessed, y_train)
    joblib.dump(train_model, "model.pkl")

    print("Training score:", score)

    return model

def transform_preprocess(x_test):
    # Load saved pickles
    loaded_vectorizer = joblib.load("vectorizer.pkl")
    loaded_tfidf_transform = joblib.load("tfidf_transform.pkl")
    loaded_model = joblib.load("model.pkl")

    # Transform
    x_val_vec = loaded_vectorizer.transform(x_test)
    x_val_tfidf = loaded_tfidf_transform.transform(x_val_vec)

    return x_val_tfidf


In [89]:
# Train model
trained_model = train_model(fake_news_train, fake_news_train['label'])

# Test set
x_test_tfidf = transform_preprocess(fake_news_test['text'])
y_predictions = trained_model.predict(x_test_tfidf)

acc_score = accuracy_score(fake_news_test['label'], y_predictions)
print("Accuracy on test set:", acc_score)


Vectorizer vocabulary: 6941
Count Vectorizer shape: (3992, 79057)
TF-IDF shape: (3992, 79057)


Training score: 0.8469438877755511
Accuracy on test set: 0.6316666666666667
