### CS 421 PROJECT

In [1]:
import numpy as np
import pandas as pd

# Load all previous weeks' data as training sets
batch1=np.load("first_batch.npz")
batch2=np.load("second_batch_with_labels.npz")
batch3=np.load("third_batch_with_labels.npz")

In [2]:
X1=pd.DataFrame(batch1["X"], columns=["user", "item", "rating"])
y1=pd.DataFrame(batch1["y"], columns=["user", "label"])
X2=pd.DataFrame(batch2["X"], columns=["user", "item", "rating"])
y2=pd.DataFrame(batch2["y"], columns=["user", "label"])
X3=pd.DataFrame(batch3["X"], columns=["user", "item", "rating"])
y3=pd.DataFrame(batch3["y"], columns=["user", "label"])

# Concatenate all previous weeks' batches into a single dataframe each
data_X = [X1, X2, X3]
data_y = [y1, y2, y3]
X = pd.concat(data_X)
y = pd.concat(data_y)

y.drop('user', axis=1, inplace=True)

In [22]:
def get_features(X: pd.DataFrame):
  features = pd.DataFrame(X.groupby('user')['user'].max())
  features['count'] = X.groupby('user')['rating'].count()
  features['std'] = X.groupby('user')['rating'].std()
  features['25%'] = X.groupby('user')['rating'].quantile(0.25)
  features['50%'] = X.groupby('user')['rating'].quantile(0.50)
  features['75%'] = X.groupby('user')['rating'].quantile(0.75)
  
  features['mean'] = X.groupby('user')['rating'].mean()
  features['median'] = X.groupby('user')['rating'].median()
  features['max'] = X.groupby('user')['rating'].max()
  features['min'] = X.groupby('user')['rating'].min()
  features['mode'] = X.groupby('user')['rating'].apply(lambda x: x.value_counts().index[0])
  return features

X_features = get_features(X)
X_features = X_features.fillna(0)



In [23]:
from sklearn.model_selection import train_test_split

# Anomaly detection methods
from sklearn.ensemble import IsolationForest
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.mixture import GaussianMixture

from sklearn.metrics import auc, roc_auc_score, precision_score, recall_score, f1_score

In [24]:
X_train, X_test, y_train, y_test = train_test_split(X_features, y, test_size=0.2, random_state=42)

########### Isolation Forest ##########
IL=IsolationForest(n_estimators=500)
IL.fit(X_train)

########### Logistic Regression ##########
lr = LogisticRegression()
lr.fit(X_train, np.ravel(y_train, order='C'))
y_pred_lr = lr.predict_proba(X_test)[:,1]

########### Kmeans ##########
N=4 # 4 seems optimal (i tried 2 to 12 clusters)
kmeans = KMeans(n_clusters=N, init='k-means++', max_iter=50, random_state=0)
kmeans.fit(X_train)
scaler=[RobustScaler() for _ in range(N)]
distances_train, centres_train = kmeans.transform(X_train), kmeans.predict(X_train)
distnear_train = distances_train[range(centres_train.shape[0]), centres_train]
distances_test, centres_test = kmeans.transform(X_test), kmeans.predict(X_test)
distnear_test = distances_test[range(centres_test.shape[0]), centres_test]

for i in range(N):
    scaler[i].fit(distnear_train[centres_train==i].reshape(-1,1))
for i in range(N):
    distnear_test[centres_test==i] = scaler[i].transform(distnear_test[centres_test==i].reshape(-1,1)).reshape(-1)
    
########## Gaussian Mixture ##########
NN = 1
gm = GaussianMixture(n_components=NN, random_state=0, covariance_type="full").fit(X_train)
y_pred_gm = -gm.score_samples(X_test)
    
    
# Print AUC scores
print("######################### AUC SCORES ######################### ")
print("Isolation Forest:", roc_auc_score(y_test,-IL.score_samples(X_test)))
print("Logistic Regression:", roc_auc_score(y_test,y_pred_lr))
print("KMeans:", roc_auc_score(y_test, distnear_test))
print("Gaussian Mixture:", roc_auc_score(y_test, y_pred_gm))
print("##############################################################")


######################### AUC SCORES ######################### 
Isolation Forest: 0.6727926217279262
Logistic Regression: 0.9042306872565926
KMeans: 0.5310641142392125
Gaussian Mixture: 0.6878039565208968
##############################################################


In [25]:
# Load and predict on fourth batch
batch4 = np.load('fourth_batch_with_labels.npz')
X4 = pd.DataFrame(batch4['X'], columns=["user", "item", "rating"])
y4 = pd.DataFrame(batch4['y'], columns=["user", "label"])  
y4.drop("user", axis=1, inplace=True)

# Get features on 4th batch
X4_features = get_features(X4)

In [26]:
# Logistic Regression
lr.fit(X_features, np.ravel(y, order='C'))
y_pred_logreg = lr.predict(X4_features)
y_pred_proba = lr.predict_proba(X4_features)[::,1]


# Evaluation
print("------Logistic regression evaluation metrics------")
print(f"AUC: {roc_auc_score(y4, y_pred_proba)}")
print(f"F1: {f1_score(y4, y_pred_logreg)}")
print(f"Precision: {precision_score(y4, y_pred_logreg)}")
print(f"Recall: {recall_score(y4, y_pred_logreg)}")


------Logistic regression evaluation metrics------
AUC: 0.7724233333333332
F1: 0.5112262521588947
Precision: 0.5304659498207885
Recall: 0.49333333333333335


In [27]:
### Prediction on 4th batch 

########### Isolation Forest ##########
IL=IsolationForest(n_estimators=500)
IL.fit(X_features)

########### Logistic Regression ##########
lr = LogisticRegression()
lr.fit(X_features, np.ravel(y, order='C'))
y_pred_lr = lr.predict_proba(X4_features)[:,1]

########### Kmeans ##########
# N=3 # 4 seems optimal (i tried 2 to 12 clusters)
# kmeans = KMeans(n_clusters=N, init='k-means++', max_iter=50, random_state=0)
# kmeans.fit(X_features)
# scaler=[RobustScaler() for _ in range(N)]
# distances_train, centres_train = kmeans.transform(X_features), kmeans.predict(X_features)
# distnear_train = distances_train[range(centres_train.shape[0]), centres_train]
# distances_test, centres_test = kmeans.transform(X4_features), kmeans.predict(X4_features)
# distnear_test = distances_test[range(centres_test.shape[0]), centres_test]

# for i in range(N):
#     scaler[i].fit(distnear_train[centres_train==i].reshape(-1,1))
# for i in range(N):
#     distnear_test[centres_test==i] = scaler[i].transform(distnear_test[centres_test==i].reshape(-1,1)).reshape(-1)

    
########## Gaussian Mixture ##########
NN = 1
gm = GaussianMixture(n_components=NN, random_state=0, covariance_type="full").fit(X_train)
y_pred_gm = -gm.score_samples(X4_features)
    
    
# Print AUC scores
print("######################### AUC SCORES ######################### ")
print("Isolation Forest:", roc_auc_score(y4,-IL.score_samples(X4_features)))
print("Logistic Regression:", roc_auc_score(y4,y_pred_lr))
# print("KMeans:", roc_auc_score(y_test, distnear_test))
print("Gaussian Mixture:", roc_auc_score(y4, y_pred_gm))
print("##############################################################")

######################### AUC SCORES ######################### 
Isolation Forest: 0.6889433333333332
Logistic Regression: 0.7724233333333332
Gaussian Mixture: 0.6823166666666667
##############################################################


In [None]:
predictions = pd.DataFrame(y_pred_proba, columns=['y_pred'])
predictions

In [None]:
np.savez('W11_predictions_emmanuel.npz', predictions)