In [None]:
import copy
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import torch
import torch.nn as nn
import torch.optim as optim
import torch.optim.lr_scheduler as lr_scheduler
import tqdm
from scipy import stats
from torch.utils.data import DataLoader, TensorDataset
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import StandardScaler
import tensorflow as tf
from sklearn.decomposition import PCA
from sklearn.model_selection import StratifiedShuffleSplit

In [None]:
from google.colab import drive
drive.mount('/content/drive')

Drive already mounted at /content/drive; to attempt to forcibly remount, call drive.mount("/content/drive", force_remount=True).


In [None]:
import os
BASE_DIR = '/content/drive/MyDrive/Criteo_Hackathon'
os.chdir(BASE_DIR)

###SAMPLING THE BULK DATASET

In [None]:
bulk_data = np.load("bulk_embeddings.npy")

labels = bulk_data[:, -1]

# Count the occurrences of each label
unique_labels, label_counts = np.unique(labels, return_counts=True)

# Print the counts for each label
for label, count in zip(unique_labels, label_counts):
    print("Label {}: {} occurrences".format(label, count))

Label 0.0: 95 occurrences
Label 1.0: 193 occurrences
Label 2.0: 51 occurrences
Label 3.0: 65 occurrences
Label 4.0: 27 occurrences
Label 5.0: 39 occurrences
Label 6.0: 1 occurrences
Label 7.0: 532 occurrences
Label 8.0: 60 occurrences
Label 9.0: 1251 occurrences
Label 10.0: 308 occurrences
Label 11.0: 44 occurrences
Label 12.0: 332 occurrences
Label 13.0: 110 occurrences
Label 14.0: 19 occurrences
Label 15.0: 11 occurrences
Label 16.0: 1 occurrences
Label 17.0: 1 occurrences
Label 18.0: 1259 occurrences
Label 19.0: 621 occurrences
Label 20.0: 498 occurrences
Label 21.0: 1037 occurrences
Label 22.0: 416 occurrences
Label 23.0: 61 occurrences
Label 24.0: 156 occurrences
Label 25.0: 6 occurrences
Label 26.0: 117 occurrences
Label 28.0: 48 occurrences
Label 29.0: 377 occurrences
Label 30.0: 583 occurrences
Label 31.0: 242 occurrences
Label 32.0: 46 occurrences
Label 33.0: 33 occurrences
Label 34.0: 209 occurrences
Label 35.0: 234 occurrences
Label 36.0: 125 occurrences
Label 37.0: 20 occur

In [None]:
# Extract the last column (assumes the labels are in the last column)
labels = bulk_data[:, -1]

# Count the occurrences of each label
unique_labels, label_counts = np.unique(labels, return_counts=True)

# Initialize an empty list to store the sampled data
sampled_data = []

# Iterate over each label and sample data based on the label count
for label, count in zip(unique_labels, label_counts):
    label_data = bulk_data[labels == label]
    if count < 10:
        sample_size = count
    else:
        sample_size = int(0.10 * count)
    if sample_size > 0:
        sampled_indices = np.random.choice(len(label_data), size=sample_size, replace=False)
        sampled_data.append(label_data[sampled_indices])

# Concatenate the sampled data for all labels into a single array
if len(sampled_data) > 0:
    sampled_data = np.concatenate(sampled_data, axis=0)

# Check that all unique labels are present in the sampled data
assert np.all(np.isin(unique_labels, np.unique(sampled_data[:, -1])))

# Save the sampled data to a file
np.save("sampled_data.npy", sampled_data)

In this code, we iterate over each label and sample data based on the label count. If the count of a label is less than 10, we sample all available data points for that label. Otherwise, we sample 10% of the label count. We also check that the sample size is greater than 0 to avoid errors when sampling a label with very low count. Finally, we concatenate the sampled data for all labels, check that all unique labels are present, and save the sampled data to a file.

### SPLITING THE GOLDEN_EMBEDDINGS into 80/20 train/test

In [None]:
# Load the entire file
golden_data = np.load('golden_embeddings.npy')

# Split the file into 80% and 20% splits
split_index = int(0.8 * len(golden_data))
train_embedding = golden_data[:split_index]
test_embedding = golden_data[split_index:]

# Save the splits as separate .npy files
np.save('train_golden_embedding.npy', train_embedding)
np.save('test_golden_embedding.npy', test_embedding)

In [None]:
#separating the labels from golden_embeddings to do our own prediction
test_embeddings = np.load('test_golden_embedding.npy')

# Separate the labels from the embeddings
test_labels = test_embeddings[:, -1]
test_embeddings = test_embeddings[:, :-1]

# Save the embeddings and labels as separate files
np.save('test_golden_embeddings.npy', test_embeddings)
np.save('test_golden_labels.npy', test_labels)

###NN CLASSIFICATION MODEL ON BULK_SAMPLED

In [None]:
data = np.load('sampled_data.npy')

embeddings, labels = data[:, :-1], data[:, -1]

# Encode the labels as integers
label_encoder = LabelEncoder()
label_encoder.fit(labels)
encoded_labels = label_encoder.transform(labels)

# Convert data to PyTorch tensors
X = torch.from_numpy(embeddings).float()
y = torch.from_numpy(encoded_labels).long()

# Define the PyTorch dataset and dataloader
dataset = TensorDataset(X, y)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(200, 256)
        self.fc2 = nn.Linear(256, 128)
        self.fc3 = nn.Linear(128, len(label_encoder.classes_))
        
    def forward(self, x):
        x = torch.tanh(self.fc1(x))
        x = torch.tanh(self.fc2(x))
        x = nn.functional.softmax(self.fc3(x), dim=1)
        return x

# Initialize the neural network
net = Net()

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()

# Define the optimizer and learning rate scheduler
optimizer = optim.RMSprop(net.parameters(), lr=0.001)
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.1, patience=5)

# Train the neural network
for epoch in range(10):
    running_loss = 0.0
    correct = 0
    total = 0
    for i, data in enumerate(dataloader):
        inputs, labels = data

        optimizer.zero_grad()

        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        # Compute accuracy
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    # Step the scheduler
    scheduler.step(running_loss / len(dataloader))

    # Print the loss and accuracy of the current epoch
    print(f"Epoch {epoch + 1}, Loss: {running_loss / len(dataloader)} Accuracy: {100 * correct/total:.2f}%")

Epoch 1, Loss: 7.185574295583682 Accuracy: 75.19%
Epoch 2, Loss: 7.063888881303127 Accuracy: 87.44%
Epoch 3, Loss: 7.044586457635553 Accuracy: 89.37%
Epoch 4, Loss: 7.034479275770644 Accuracy: 90.40%
Epoch 5, Loss: 7.028433108976891 Accuracy: 91.00%
Epoch 6, Loss: 7.024092578257135 Accuracy: 91.44%
Epoch 7, Loss: 7.01953344418324 Accuracy: 91.90%
Epoch 8, Loss: 7.016191233195054 Accuracy: 92.25%
Epoch 9, Loss: 7.013811344122656 Accuracy: 92.48%
Epoch 10, Loss: 7.0121548541061856 Accuracy: 92.64%


###SAVE THE CODE

In [None]:
torch.save(net.state_dict(), 'pretrained_model.pth')

###TRANSFER LEARNING ON WHOLE GOLDEN DATASET

In [None]:
# Load the data
data = np.load('golden_embeddings.npy')

# Extract the embeddings and labels
embeddings, labels = data[:, :-1], data[:, -1]

# Encode the labels as integers
label_encoder = LabelEncoder()
label_encoder.fit(labels)
encoded_labels = label_encoder.transform(labels)

# Convert data to PyTorch tensors
X = torch.from_numpy(embeddings).float()
y = torch.from_numpy(encoded_labels).long()

# Define the PyTorch dataset and dataloader
dataset = TensorDataset(X, y)
dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

class NewNet(nn.Module):
    def __init__(self, pretrained_model, n_classes):
        super(NewNet, self).__init__()

        # Define the pretrained layers
        self.fc1 = nn.Linear(200, 256)
        self.fc2 = nn.Linear(256, 128)

        # Define the new layers
        self.fc3 = nn.Linear(128, n_classes)
#fc1.weight', 'fc1.bias', 'fc2.weight', 'fc2.bias', 'fc3.weight', 'fc3.bias'
        # Load the pretrained weights
        self.fc1.weight.data = pretrained_model['fc1.weight']
        self.fc1.bias.data = pretrained_model['fc1.bias']
        self.fc2.weight.data = pretrained_model['fc2.weight']
        self.fc2.bias.data = pretrained_model['fc2.bias']
        self.fc3.weight.data = pretrained_model['fc3.weight']
        self.fc3.bias.data = pretrained_model['fc3.bias']

    def forward(self, x):
        x = torch.tanh(self.fc1(x))
        x = torch.tanh(self.fc2(x))
        x = nn.functional.softmax(self.fc3(x), dim=1)
        return x


# Load the pretrained model
pretrained_state_dict = torch.load('pretrained_model.pth')

# Get the number of classes
n_classes = len(label_encoder.classes_)

# Initialize the new network
net = NewNet(pretrained_state_dict, n_classes)

# Freeze the parameters of the pretrained layers
for param in net.fc1.parameters():
    param.requires_grad = False
for param in net.fc2.parameters():
    param.requires_grad = False

# Define the loss function and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.RMSprop(net.fc3.parameters(), lr=0.001)

# Define the learning rate scheduler
scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.1, patience=5)

# Train the neural network
for epoch in range(10):
    running_loss = 0.0
    correct = 0
    total = 0
    for i, data in enumerate(dataloader):
        inputs, labels = data

        optimizer.zero_grad()

        outputs = net(inputs)
        loss = criterion(outputs, labels)
        loss.backward()
        optimizer.step()

        running_loss += loss.item()

        # Compute accuracy
        _, predicted = torch.max(outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

    # Step the scheduler
    scheduler.step(running_loss / len(dataloader))

    # Print the loss and accuracy of the current epoch
    print(f"Epoch {epoch + 1}, Loss: {running_loss / len(dataloader)} Accuracy: {100 * correct/total:.2f}%")

Epoch 1, Loss: 7.4528805486667205 Accuracy: 49.05%
Epoch 2, Loss: 7.2685827686678355 Accuracy: 67.57%
Epoch 3, Loss: 7.2474426427194265 Accuracy: 69.60%
Epoch 4, Loss: 7.23858619379453 Accuracy: 70.45%
Epoch 5, Loss: 7.232782872283352 Accuracy: 71.03%
Epoch 6, Loss: 7.228137534583305 Accuracy: 71.44%
Epoch 7, Loss: 7.224778463083505 Accuracy: 71.79%
Epoch 8, Loss: 7.222048855773419 Accuracy: 72.03%
Epoch 9, Loss: 7.219395687757672 Accuracy: 72.31%
Epoch 10, Loss: 7.217353874454582 Accuracy: 72.53%


### PREDICTING LABLES OF golden_dataset_without_labels

In [None]:
# Load the new embeddings
new_embeddings = np.load('golden_dataset_without_labels_part1.npy')

# Convert data to PyTorch tensor
new_features = torch.from_numpy(new_embeddings).float()

# Predict labels for the new embeddings
net.eval()
with torch.no_grad():
    outputs = net(new_features)
    _, predicted = torch.max(outputs.data, 1)

# Decode the predicted labels using the label encoder
predicted_labels_part1 = label_encoder.inverse_transform(predicted.numpy())

In [None]:
# Load the new embeddings
new_embeddings = np.load('golden_dataset_without_labels_part2.npy')

# Convert data to PyTorch tensor
new_features = torch.from_numpy(new_embeddings).float()

# Predict labels for the new embeddings
net.eval()
with torch.no_grad():
    outputs = net(new_features)
    _, predicted = torch.max(outputs.data, 1)

# Decode the predicted labels using the label encoder
predicted_labels_part2 = label_encoder.inverse_transform(predicted.numpy())

In [None]:
predicted_labels=np.concatenate((predicted_labels_part1, predicted_labels_part2), axis=0)

In [None]:
# Save the numpy array as a .npy file
np.save('predicted_labels.npy', predicted_labels)

In [None]:
# Load the numpy array from the .npy file
loaded_labels = np.load('predicted_labels.npy')

# Print the loaded numpy array
print(loaded_labels)

In [None]:
print(predicted_labels_part1)
print(predicted_labels_part2)
print(predicted_labels)

#### Divided the golden_dataset_without_labels.npy files into 2 parts beacuse it was too huge to process it at once. Then concatenated the PREDICTED LABELS of divided parts.

# TESTING

###TRANSFER LEARNING ON 80% TRAIN AND 20% TEST GOLDEN DATASET 

In [None]:
# # Load the data
# data = np.load('golden_embeddings.npy')

# # Extract the embeddings and labels
# embeddings, labels = data[:, :-1], data[:, -1]

# # Encode the labels as integers
# label_encoder = LabelEncoder()
# label_encoder.fit(labels)
# encoded_labels = label_encoder.transform(labels)

# # Split the data into train and test sets
# X_train, X_test, y_train, y_test = train_test_split(embeddings, encoded_labels, test_size=0.2, random_state=42)

# # Convert data to PyTorch tensors
# X_train = torch.from_numpy(X_train).float()
# y_train = torch.from_numpy(y_train).long()
# X_test = torch.from_numpy(X_test).float()
# y_test = torch.from_numpy(y_test).long()

# # Define the PyTorch dataset and dataloader for train set
# train_dataset = TensorDataset(X_train, y_train)
# train_dataloader = DataLoader(train_dataset, batch_size=32, shuffle=True)

# # Define the PyTorch dataset and dataloader for test set
# test_dataset = TensorDataset(X_test, y_test)
# test_dataloader = DataLoader(test_dataset, batch_size=32, shuffle=True)

# class NewNet(nn.Module):
#     def __init__(self, pretrained_model, n_classes):
#         super(NewNet, self).__init__()

#         # Define the pretrained layers
#         self.fc1 = nn.Linear(200, 256)
#         self.fc2 = nn.Linear(256, 128)

#         # Define the new layers
#         self.fc3 = nn.Linear(128, n_classes)

#         # Load the pretrained weights
#         self.fc1.weight.data = pretrained_model['fc1.weight']
#         self.fc1.bias.data = pretrained_model['fc1.bias']
#         self.fc2.weight.data = pretrained_model['fc2.weight']
#         self.fc2.bias.data = pretrained_model['fc2.bias']
#         self.fc3.weight.data = pretrained_model['fc3.weight']
#         self.fc3.bias.data = pretrained_model['fc3.bias']

#     def forward(self, x):
#         x = torch.tanh(self.fc1(x))
#         x = torch.tanh(self.fc2(x))
#         x = nn.functional.softmax(self.fc3(x), dim=1)
#         return x


# # Load the pretrained model
# pretrained_state_dict = torch.load('pretrained_model.pth')

# # Get the number of classes
# n_classes = len(label_encoder.classes_)

# # Initialize the new network
# net = NewNet(pretrained_state_dict, n_classes)

# # Freeze the parameters of the pretrained layers
# for param in net.fc1.parameters():
#     param.requires_grad = False
# for param in net.fc2.parameters():
#     param.requires_grad = False

# # Define the loss function and optimizer
# criterion = nn.CrossEntropyLoss()
# optimizer = optim.RMSprop(net.fc3.parameters(), lr=0.001)

# # Define the learning rate scheduler
# scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.1, patience=5)

# # Train the neural network
# for epoch in range(10):
#     train_loss = 0.0
#     train_correct = 0
#     train_total = 0
#     for i, data in enumerate(train_dataloader):
#         inputs, labels = data
#         optimizer.zero_grad()

#         outputs = net(inputs)
#         loss = criterion(outputs, labels)
#         loss.backward()
#         optimizer.step()

#         running_loss += loss.item()

#         # Compute accuracy
#         _, predicted = torch.max(outputs.data, 1)
#         total += labels.size(0)
#         correct += (predicted == labels).sum().item()

# # Step the scheduler
# #scheduler.step(running_loss / len(train_loader))
# scheduler.step(running_loss / len(dataloader))

# # Print the loss and accuracy of the current epoch
# print(f"Epoch {epoch + 1}, Loss: {running_loss / len(train_dataloader)} Accuracy: {100 * correct/total:.2f}%")

# # Evaluate the model on the test set
# net.eval()
# test_loss = 0.0
# test_correct = 0
# test_total = 0
# with torch.no_grad():
#     for i, data in enumerate(test_dataloader):
#         inputs, labels = data

#         outputs = net(inputs)
#         loss = criterion(outputs, labels)

#         test_loss += loss.item()

#         # Compute accuracy
#         _, predicted = torch.max(outputs.data, 1)
#         test_total += labels.size(0)
#         test_correct += (predicted == labels).sum().item()

# # Print the test loss and accuracy of the current epoch
# print(f"Test Loss: {test_loss / len(test_dataloader)} Test Accuracy: {100 * test_correct/test_total:.2f}%")

###TRANSFER LEARNING ON 80% TRAIN GOLDEN DATASET

In [None]:
# # Load the data
# data = np.load('train_golden_embedding.npy')

# # Extract the embeddings and labels
# embeddings, labels = data[:, :-1], data[:, -1]

# # Encode the labels as integers
# label_encoder = LabelEncoder()
# label_encoder.fit(labels)
# encoded_labels = label_encoder.transform(labels)

# # Convert data to PyTorch tensors
# X = torch.from_numpy(embeddings).float()
# y = torch.from_numpy(encoded_labels).long()

# # Define the PyTorch dataset and dataloader
# dataset = TensorDataset(X, y)
# dataloader = DataLoader(dataset, batch_size=32, shuffle=True)

# class NewNet(nn.Module):
#     def __init__(self, pretrained_model, n_classes):
#         super(NewNet, self).__init__()

#         # Define the pretrained layers
#         self.fc1 = nn.Linear(200, 256)
#         self.fc2 = nn.Linear(256, 128)

#         # Define the new layers
#         self.fc3 = nn.Linear(128, n_classes)
# #fc1.weight', 'fc1.bias', 'fc2.weight', 'fc2.bias', 'fc3.weight', 'fc3.bias'
#         # Load the pretrained weights
#         self.fc1.weight.data = pretrained_model['fc1.weight']
#         self.fc1.bias.data = pretrained_model['fc1.bias']
#         self.fc2.weight.data = pretrained_model['fc2.weight']
#         self.fc2.bias.data = pretrained_model['fc2.bias']
#         self.fc3.weight.data = pretrained_model['fc3.weight']
#         self.fc3.bias.data = pretrained_model['fc3.bias']

#     def forward(self, x):
#         x = torch.tanh(self.fc1(x))
#         x = torch.tanh(self.fc2(x))
#         x = nn.functional.softmax(self.fc3(x), dim=1)
#         return x


# # Load the pretrained model
# pretrained_state_dict = torch.load('pretrained_model.pth')

# # Get the number of classes
# n_classes = len(label_encoder.classes_)

# # Initialize the new network
# net = NewNet(pretrained_state_dict, n_classes)

# # Freeze the parameters of the pretrained layers
# for param in net.fc1.parameters():
#     param.requires_grad = False
# for param in net.fc2.parameters():
#     param.requires_grad = False

# # Define the loss function and optimizer
# criterion = nn.CrossEntropyLoss()
# optimizer = optim.RMSprop(net.fc3.parameters(), lr=0.001)

# # Define the learning rate scheduler
# scheduler = optim.lr_scheduler.ReduceLROnPlateau(optimizer, factor=0.1, patience=5)

# # Train the neural network
# for epoch in range(10):
#     running_loss = 0.0
#     correct = 0
#     total = 0
#     for i, data in enumerate(dataloader):
#         inputs, labels = data

#         optimizer.zero_grad()

#         outputs = net(inputs)
#         loss = criterion(outputs, labels)
#         loss.backward()
#         optimizer.step()

#         running_loss += loss.item()

#         # Compute accuracy
#         _, predicted = torch.max(outputs.data, 1)
#         total += labels.size(0)
#         correct += (predicted == labels).sum().item()

#     # Step the scheduler
#     scheduler.step(running_loss / len(dataloader))

#     # Print the loss and accuracy of the current epoch
#     print(f"Epoch {epoch + 1}, Loss: {running_loss / len(dataloader)} Accuracy: {100 * correct/total:.2f}%")

###PREDICTING LABELS ON REMAINING 20% GOLDEN DATASET

In [None]:
# # Load the new embeddings
# new_embeddings = np.load('test_golden_embedding.npy')

# # Convert data to PyTorch tensor
# new_features = torch.from_numpy(new_embeddings).float()

# # Predict labels for the new embeddings
# net.eval()
# with torch.no_grad():
#     outputs = net(new_features)
#     _, predicted = torch.max(outputs.data, 1)

# # Decode the predicted labels using the label encoder
# predicted_labels = label_encoder.inverse_transform(predicted.numpy())

In [None]:
# actual_labels=np.load("test_golden_labels.npy")

### ACTUAL VS PREDICTED LABELS

In [None]:
# total_actual_labels_count = len(actual_labels)
# total_predicted_labels_count = len(predicted_labels)
# matched_labels_count = np.sum(predicted_labels == actual_labels)
# mismatched_labels_count = np.sum(predicted_labels != actual_labels)

# matched_labels_percent = matched_labels_count / total_actual_labels_count * 100

# print(f"Total actual label count: {total_actual_labels_count}")
# print(f"Total predicted label count: {total_predicted_labels_count}")
# print(f"Matched label count: {matched_labels_count}")
# print(f"Mismatched label count: {mismatched_labels_count}")
# print(f"Matched label percentage: {matched_labels_percent:.2f}%")