In [1]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import seaborn as sns

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

import seaborn as ss

import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
from nltk.corpus import stopwords, words

# Ensure required NLTK resources are downloaded
try:
    nltk.download('punkt')
    nltk.download('stopwords')
    nltk.download('words')

except Exception as e:
    print(f"Error downloading NLTK resources: {e}")

# Define stopwords list
specific_stopwords = ["https", "subreddit", "www", "com"] ## some specific words for the given dataset
stopwords_list = stopwords.words('english') +specific_stopwords + stopwords.words('french') # dataset is both in english and in french


[nltk_data] Downloading package punkt to /home/clatimie/nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     /home/clatimie/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package words to /home/clatimie/nltk_data...
[nltk_data]   Package words is already up-to-date!


In [2]:
# Define the path to the training data file
path_training = "../datasets/Train.csv"

# Read the CSV file into a pandas DataFrame
training_data = pd.read_csv(path_training, delimiter=',')

# Set column names explicitly for better readability
training_data.columns = ['text', 'subreddit']

# Shuffle dataset
training_data = training_data.sample(frac=1, random_state=42).reset_index(drop=True) 

# Separate the training data into two series: texts and subreddit labels
x_train = training_data['text']          # Contains the Reddit posts or comments
y_train = training_data['subreddit'] # Contains the subreddit each post originates from

# Get unique subreddit labels
unique_labels = np.unique(y_train)   # List of unique subreddits in the dataset

n_samples_training = x_train.shape[0]
n_classes = unique_labels.shape[0]

print(f"Training dataset has {n_samples_training} examples and there are {n_classes} classes")

Training dataset has 1399 examples and there are 4 classes


In [3]:
# Define the path to the training data file
path_test = "../datasets/Test.csv"

# Read the CSV file into a pandas DataFrame
x_test = pd.read_csv(path_test, delimiter=',')["body"]

n_samples_test = x_test.shape[0]
print(f"Test dataset has {n_samples_test} examples")


Test dataset has 600 examples


In [4]:
class LemmaTokenizer:
    def __init__(self, stopwords=None):
        self.wnl = WordNetLemmatizer()
        self.stop_words = stopwords

    def __call__(self, doc):
        # Tokenize the document and apply lemmatization and filtering
        return [
            self.wnl.lemmatize(t, pos="v") for t in word_tokenize(doc)
            if t.isalpha() and t.lower() not in self.stop_words]

# SVC - Two classifiers method #

In [5]:
from sklearn.svm import SVC
from sklearn.model_selection import KFold
from sklearn.metrics import accuracy_score, classification_report
if False:
    #################################################################### Classifier 1 : Montreal or Not ######################################################################
    # Best parameters found : max_features = 1000, svm_C = 10, kernel="rbf", svm_gamma = 'scale'

    # Convert y_train to binary labels for the first classifier
    y_binary = np.array([1 if label == "Montreal" else 0 for label in y_train])

    TfidfVectorizer1 = TfidfVectorizer(
        lowercase=True,
        max_features=1000,
        tokenizer=LemmaTokenizer(stopwords_list)
    )

    x_tfidf1 = TfidfVectorizer1.fit_transform(x_train)
    scaler1 = StandardScaler()
    x_scaled_1 = scaler1.fit_transform(np.asarray(x_tfidf1.todense()))

    binary_svm = SVC(C=10.0, kernel="rbf", gamma="scale")

    # 10 Fold cross validation
    kf = KFold(n_splits=10, shuffle=True, random_state=42)
    accuracies = []

    for train_index, val_index in kf.split(x_scaled_1):
        X_train_fold, X_val_fold = x_scaled_1[train_index], x_scaled_1[val_index]
        y_fold_train, y_fold_val = y_binary[train_index], y_binary[val_index]
        
        # Train best binary SVM classifier
        binary_svm.fit(X_train_fold, y_fold_train)
        
        # Predict and evaluate on validation set
        y_pred = binary_svm.predict(X_val_fold)
        accuracy = accuracy_score(y_fold_val, y_pred)
        accuracies.append(accuracy)
        
        # Display results for each fold
        print(f"Accuracy for fold: {accuracy:.4f}")
        print("Classification Report:\n", classification_report(y_fold_val, y_pred))

    # Calculate and display mean accuracy
    mean_accuracy = np.mean(accuracies)
    print(f"Mean Accuracy across 10 folds for best binary classifier: {mean_accuracy:.4f}")

    #################################################################### Classifier 2 : Other subreddits ######################################################################

    # Best parameters for multi-class SVM: {'svm__C': 0.1, 'svm__gamma': 'scale', 'svm__kernel': 'linear', 'tfidf__max_features': 4000}

    # Filter for "Not Montreal" entries for the second classifier
    x_not_montreal = np.array(x_train)[y_binary == 0]
    y_not_montreal = np.array(y_train)[y_binary == 0]

    TfidfVectorizer2 = TfidfVectorizer(
        lowercase=True,
        max_features=4000,
        stop_words=stopwords_list
    )
    x_tfidf2 = TfidfVectorizer2.fit_transform(x_not_montreal)
    scaler2 = StandardScaler()
    x_scaled_2 = scaler2.fit_transform(np.asarray(x_tfidf2.todense()))

    multi_class_svm = SVC(C=0.1, kernel="linear", gamma="scale")

    accuracies = []

    for train_index, val_index in kf.split(x_scaled_2):
        X_train_fold, X_val_fold = x_scaled_2[train_index], x_scaled_2[val_index]
        y_fold_train, y_fold_val = y_not_montreal[train_index], y_not_montreal[val_index]
        
        # Train best multi-class SVM on non-Montreal data
        multi_class_svm.fit(X_train_fold, y_fold_train)
        
        # Predict and evaluate on validation set
        y_pred = multi_class_svm.predict(X_val_fold)
        accuracy = accuracy_score(y_fold_val, y_pred)
        accuracies.append(accuracy)
        
        # Display results for each fold
        print(f"Accuracy for fold: {accuracy:.4f}")
        print("Classification Report:\n", classification_report(y_fold_val, y_pred))

    # Calculate and display mean accuracy for the multi-class classifier
    mean_accuracy = np.mean(accuracies)
    print(f"Mean Accuracy across 10 folds for best multi-class classifier: {mean_accuracy:.4f}")

    ################################################################################ Fit the two classifiers with the whole dataset #########################################
    binary_svm.fit(x_scaled_1, y_binary)
    multi_class_svm.fit(x_scaled_2, y_not_montreal)

    ######################################################################### Save predictions in a csv file ##############################################################
    def predict_subreddits_SVC(x_test, binary_model, multi_class_model, vectorizer_binary, vectorizer_multiclass, scaler_binary, scaler_multiclass):
        """
        Predicts the location for a given array of texts using a binary and multi-class model.
        Saves predictions in a CSV file with 'id' and 'subreddit' columns.

        Parameters:
        - x_test (array-like): Array of text inputs to predict.
        - binary_model (sklearn model): Fitted binary SVM model.
        - multi_class_model (sklearn model): Fitted multi-class SVM model.
        - vectorizer_binary (TfidfVectorizer): Fitted TF-IDF vectorizer for binary model.
        - vectorizer_multiclass (TfidfVectorizer): Fitted TF-IDF vectorizer for multi-class model.
        - scaler_binary (StandardScaler): Fitted scaler for binary model.
        - scaler_multiclass (StandardScaler): Fitted scaler for multi-class model.

        Returns:
        - DataFrame: Prediction results with 'id' and 'subreddit' columns.
        """
        # Step 1: Transform texts using the vectorizer and scaler for binary model
        x_test_tfidf1 = vectorizer_binary.transform(x_test)
        x_test_scaled_1 = scaler_binary.transform(np.asarray(x_test_tfidf1.todense()))
        
        # Step 2: Predict with the binary model to identify "Montreal" vs "Not Montreal"
        binary_predictions = binary_model.predict(x_test_scaled_1)
        
        # Step 3: Prepare to apply multi-class model only to "Not Montreal" entries
        not_montreal_mask = (binary_predictions == 0)  # Mask for "Not Montreal" entries
        x_test_not_mtl = x_test[not_montreal_mask]

        # Step 4: Transform "Not Montreal" texts using multi-class vectorizer
        x_test_tfidf2 = vectorizer_multiclass.transform(x_test_not_mtl)
        x_test_scaled_2 = scaler_multiclass.transform(np.asarray(x_test_tfidf2.todense()))
        
        # Apply multi-class model only to the "Not Montreal" subset
        multi_class_predictions = multi_class_model.predict(x_test_scaled_2)
        
        # Step 5: Combine predictions for both "Montreal" and multi-class predictions
        predictions = []
        
        # Add "Montreal" predictions
        for pred in binary_predictions:
            if pred == 1:
                predictions.append("Montreal")
            else:
                predictions.append(None)  # Placeholder for "Not Montreal" predictions
                
        # Replace None with multi-class predictions for "Not Montreal" entries
        not_montreal_indices = [i for i, pred in enumerate(binary_predictions) if pred == 0]
        for idx, multi_pred in zip(not_montreal_indices, multi_class_predictions):
            predictions[idx] = multi_pred
        
        # Step 6: Create DataFrame for results
        results_df = pd.DataFrame({
            'id': range(len(x_test)),
            'subreddit': predictions
        })
        
        # Step 7: Save to CSV
        results_df.to_csv("../output/submissions_two_layer_svm.csv", index=False)
        
        return results_df


    predict_subreddits_SVC(x_test, binary_svm, multi_class_svm, TfidfVectorizer1, TfidfVectorizer2, scaler1, scaler2)



# MLP #

In [20]:
# Build the network
from tensorflow.keras import models
from tensorflow.keras import layers
from tensorflow.keras.layers import BatchNormalization
from tensorflow.keras import initializers
import numpy as np
import tensorflow as tf
from tensorflow.keras.utils import to_categorical
from tensorflow.keras import regularizers

# Set random seed
np.random.seed(0)

def sigmoid(z):
    return 1 / (1 + tf.exp(-z))

def classifier_model(layer_size, num_layers, activation, input_dim, dropout_rate=0.7):
    network = models.Sequential()

    # Add first layer
    network.add(layers.Dense(layer_size, 
                             input_dim=input_dim, 
                             activation=activation, 
                             kernel_initializer=initializers.RandomNormal(stddev=0.01),
                             kernel_regularizer=regularizers.l2(0.01)))

    for _ in range(num_layers):
        network.add(layers.Dense(layer_size, 
                                 activation=activation, 
                                 kernel_initializer=initializers.RandomNormal(stddev=0.01),
                                 kernel_regularizer=regularizers.l2(0.01)))
        network.add(BatchNormalization())
        network.add(layers.Dropout(dropout_rate))
    network.add(layers.Dense(4, activation='softmax')) # Add the output layer

    # Compile the network
    network.compile(optimizer='rmsprop',
                    loss='categorical_crossentropy',
                    metrics=['accuracy'])
    
    return network

model_1 = classifier_model(
    layer_size=200,
    num_layers=5, 
    input_dim=3000,
    activation=sigmoid,
    )

  super().__init__(activity_regularizer=activity_regularizer, **kwargs)


In [None]:
from sklearn.preprocessing import LabelEncoder
from tensorflow.keras.utils import to_categorical

# Assuming y_train contains string labels like 'Montreal', 'Toronto', etc.
# Step 1: Label encoding
label_encoder = LabelEncoder()
y_train_encoded = label_encoder.fit_transform(y_train)  # Converts string labels to integers

# Step 2: One-hot encoding
y_train_cat = to_categorical(y_train_encoded, num_classes=4)  # Assuming 4 classes

# Prepare the TfidfVectorizer and apply it to the training set
TfidfVectorizer_nn = TfidfVectorizer(
    lowercase=True,
    max_features=3000,
    tokenizer=LemmaTokenizer(stopwords=stopwords_list)  # Your LemmaTokenizer
)

# Transform the text data to the TF-IDF matrix
x_tfidf_train_nn = TfidfVectorizer_nn.fit_transform(x_train)  # Assuming x_train is your training text
scaler_nn = StandardScaler()
x_scaled_train_nn = scaler_nn.fit_transform(np.asarray(x_tfidf_train_nn.todense()))  # Scale the data

# Proceed with k-Fold cross-validation
kf = KFold(n_splits=10, shuffle=True, random_state=42)

accuracies = []  # To store accuracies for each fold

for train_index, val_index in kf.split(x_scaled_train_nn):
    # Split data into training and validation sets for this fold
    X_train_fold, X_val_fold = x_scaled_train_nn[train_index], x_scaled_train_nn[val_index]
    y_train_fold, y_val_fold = y_train_cat[train_index], y_train_cat[val_index]
    

    # Train the model on the training fold
    model_1.fit(X_train_fold, y_train_fold, epochs=20, batch_size=32, verbose=1)  # Adjust epochs and batch size

    # Evaluate the model on the validation fold
    val_loss, val_accuracy = model_1.evaluate(X_val_fold, y_val_fold)
    accuracies.append(val_accuracy)

    print(f"Validation Accuracy for this fold: {val_accuracy:.4f}")

# Print mean accuracy across all folds
print(f"Mean Validation Accuracy: {np.mean(accuracies):.4f}")




Epoch 1/10
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m2s[0m 9ms/step - accuracy: 0.2620 - loss: 3.5082
Epoch 2/10
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.4216 - loss: 2.7844
Epoch 3/10
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.5576 - loss: 2.2899
Epoch 4/10
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.7703 - loss: 1.6791
Epoch 5/10
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.8324 - loss: 1.4269
Epoch 6/10
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m1s[0m 9ms/step - accuracy: 0.8464 - loss: 1.3424 
Epoch 7/10
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 8ms/step - accuracy: 0.8226 - loss: 1.3617
Epoch 8/10
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m[37m[0m [1m0s[0m 9ms/step - accuracy: 0.8704 - loss: 1.1832
Epoch 9/10
[1m40/40[0m [32m━━━━━━━━━━━━━━━━━━━━[0m