### CS 421 PROJECT

In [227]:
import numpy as np
import pandas as pd

# Load all previous weeks' data as training sets
batch1=np.load("first_batch.npz")
batch2=np.load("second_batch_with_labels.npz")
batch3=np.load("third_batch_with_labels.npz")

In [228]:
X1=pd.DataFrame(batch1["X"], columns=["user", "item", "rating"])
y1=pd.DataFrame(batch1["y"], columns=["user", "label"])
X2=pd.DataFrame(batch2["X"], columns=["user", "item", "rating"])
y2=pd.DataFrame(batch2["y"], columns=["user", "label"])
X3=pd.DataFrame(batch3["X"], columns=["user", "item", "rating"])
y3=pd.DataFrame(batch3["y"], columns=["user", "label"])

# Concatenate all previous weeks' batches into a single dataframe each
data_X = [X1, X2, X3]
data_y = [y1, y2, y3]
X = pd.concat(data_X)
y = pd.concat(data_y)

y.drop('user', axis=1, inplace=True)

In [229]:
def get_features(X: pd.DataFrame):
  features = pd.DataFrame(X.groupby('user')['user'].max())
  features['count'] = X.groupby('user')['rating'].count()
  features['std'] = X.groupby('user')['rating'].std()
  features['25%'] = X.groupby('user')['rating'].quantile(0.25)
  features['50%'] = X.groupby('user')['rating'].quantile(0.50)
  features['75%'] = X.groupby('user')['rating'].quantile(0.75)
  
  features['mean'] = X.groupby('user')['rating'].mean()
  features['median'] = X.groupby('user')['rating'].median()
  features['max'] = X.groupby('user')['rating'].max()
  features['min'] = X.groupby('user')['rating'].min()
  features['mode'] = X.groupby('user')['rating'].apply(lambda x: x.value_counts().index[0])
  return features

X_features = get_features(X)
X_features = X_features.fillna(0)



In [230]:
from sklearn.model_selection import train_test_split

# Anomaly detection methods
from sklearn.ensemble import IsolationForest
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.mixture import GaussianMixture
from sklearn.ensemble import RandomForestClassifier

from sklearn.metrics import auc, roc_auc_score, precision_score, recall_score, f1_score

In [231]:
def get_metrics(y, list):
    for i in range (len(list)):
        print(f"AUC: {roc_auc_score(y, list[i][1])}")
        print(f"F1: {f1_score(y, list[i][0])}")
        print(f"Precision: {precision_score(y4, list[i][0])}")
        print(f"Recall: {recall_score(y, list[i][0])}")

In [232]:
X_train, X_test, y_train, y_test = train_test_split(X_features, y, test_size=0.2, random_state=42)

########### Isolation Forest ##########
IL=IsolationForest(n_estimators=500)
IL.fit(X_train)

########### Logistic Regression ##########
lr = LogisticRegression()
lr.fit(X_train, np.ravel(y_train, order='C'))
y_pred_lr = lr.predict_proba(X_test)[:,1]

########### Kmeans ##########
N=4 # 4 seems optimal (i tried 2 to 12 clusters)
kmeans = KMeans(n_clusters=N, init='k-means++', max_iter=50, random_state=0)
kmeans.fit(X_train)
scaler=[RobustScaler() for _ in range(N)]
distances_train, centres_train = kmeans.transform(X_train), kmeans.predict(X_train)
distnear_train = distances_train[range(centres_train.shape[0]), centres_train]
distances_test, centres_test = kmeans.transform(X_test), kmeans.predict(X_test)
distnear_test = distances_test[range(centres_test.shape[0]), centres_test]

for i in range(N):
    scaler[i].fit(distnear_train[centres_train==i].reshape(-1,1))
for i in range(N):
    distnear_test[centres_test==i] = scaler[i].transform(distnear_test[centres_test==i].reshape(-1,1)).reshape(-1)
    
########## Gaussian Mixture ##########
NN = 1
gm = GaussianMixture(n_components=NN, random_state=0, covariance_type="full").fit(X_train)
y_pred_gm = -gm.score_samples(X_test)

########## Random Forest ##########
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_train, np.ravel(y_train, order='C'))
y_pred_rf = rf.predict_proba(X_test)[:,1]

    
    
# Print AUC scores
print("######################### AUC SCORES ######################### ")
print("Isolation Forest:", roc_auc_score(y_test,-IL.score_samples(X_test)))
print("Logistic Regression:", roc_auc_score(y_test,y_pred_lr))
print("KMeans:", roc_auc_score(y_test, distnear_test))
print("Gaussian Mixture:", roc_auc_score(y_test, y_pred_gm))
print("Random Forest:", roc_auc_score(y_test, y_pred_rf))
print("##############################################################")




######################### AUC SCORES ######################### 
Isolation Forest: 0.6709034895661778
Logistic Regression: 0.9077183158628974
KMeans: 0.5311173974540311
Gaussian Mixture: 0.6894605801088916
Random Forest: 0.925880626223092
##############################################################


In [233]:
# Load and predict on fourth batch
batch4 = np.load('fourth_batch_with_labels.npz')
X4 = pd.DataFrame(batch4['X'], columns=["user", "item", "rating"])
y4 = pd.DataFrame(batch4['y'], columns=["user", "label"])  
y4.drop("user", axis=1, inplace=True)

# Get features on 4th batch
X4_features = get_features(X4)

In [234]:
# Logistic Regression
lr.fit(X_features, np.ravel(y, order='C'))
y_pred_logreg = lr.predict(X4_features)
y_pred_proba = lr.predict_proba(X4_features)[::,1]
logreg = []
list = []
logreg.append(y_pred_logreg)
logreg.append(y_pred_proba)
list.append(logreg)


# Evaluation
print("------Logistic regression evaluation metrics------")
get_metrics(y4, list)

# Random Forest
rf.fit(X_features, np.ravel(y, order='C'))
y_pred_rf = rf.predict(X4_features)
y_pred_proba_rf = rf.predict_proba(X4_features)[:,1]

random_forest = []
list = []
random_forest.append(y_pred_rf)
random_forest.append(y_pred_proba_rf)
list.append(random_forest)


# Evaluation
print()
print("------Random Forest evaluation metrics------")
get_metrics(y4, list)





------Logistic regression evaluation metrics------
AUC: 0.83306
F1: 0.6284658040665435
Precision: 0.7053941908713693
Recall: 0.5666666666666667

------Random Forest evaluation metrics------
AUC: 0.8769883333333334
F1: 0.6181015452538632
Precision: 0.9150326797385621
Recall: 0.4666666666666667


In [235]:
### Prediction on 4th batch 

########### Isolation Forest ##########
IL=IsolationForest(n_estimators=500)
IL.fit(X_features)

########### Logistic Regression ##########
lr = LogisticRegression()
lr.fit(X_features, np.ravel(y, order='C'))
y_pred_lr = lr.predict_proba(X4_features)[:,1]

########### Kmeans ##########
# N=3 # 4 seems optimal (i tried 2 to 12 clusters)
# kmeans = KMeans(n_clusters=N, init='k-means++', max_iter=50, random_state=0)
# kmeans.fit(X_features)
# scaler=[RobustScaler() for _ in range(N)]
# distances_train, centres_train = kmeans.transform(X_features), kmeans.predict(X_features)
# distnear_train = distances_train[range(centres_train.shape[0]), centres_train]
# distances_test, centres_test = kmeans.transform(X4_features), kmeans.predict(X4_features)
# distnear_test = distances_test[range(centres_test.shape[0]), centres_test]

# for i in range(N):
#     scaler[i].fit(distnear_train[centres_train==i].reshape(-1,1))
# for i in range(N):
#     distnear_test[centres_test==i] = scaler[i].transform(distnear_test[centres_test==i].reshape(-1,1)).reshape(-1)

    
########## Gaussian Mixture ##########
NN = 1
gm = GaussianMixture(n_components=NN, random_state=0, covariance_type="full").fit(X_train)
y_pred_gm = -gm.score_samples(X4_features)


    
# Print AUC scores
print("######################### AUC SCORES ######################### ")
print("Isolation Forest:", roc_auc_score(y4,-IL.score_samples(X4_features)))
print("Logistic Regression:", roc_auc_score(y4,y_pred_lr))
# print("KMeans:", roc_auc_score(y_test, distnear_test))
print("Gaussian Mixture:", roc_auc_score(y4, y_pred_gm))
print("Random Forest:", roc_auc_score(y4, y_pred_proba_rf))
print("##############################################################")



######################### AUC SCORES ######################### 
Isolation Forest: 0.68783
Logistic Regression: 0.83306
Gaussian Mixture: 0.6846766666666667
Random Forest: 0.8769883333333334
##############################################################


In [236]:
predictions = pd.DataFrame(y_pred_proba, columns=['y_pred'])
predictions

Unnamed: 0,y_pred
0,0.163783
1,0.894505
2,0.250268
3,0.168176
4,0.299802
...,...
1295,0.479621
1296,0.318975
1297,0.322827
1298,0.879626


In [237]:
predictions_rf = pd.DataFrame(y_pred_rf, columns=['y_pred'])
predictions_rf

Unnamed: 0,y_pred
0,0
1,0
2,0
3,0
4,0
...,...
1295,0
1296,0
1297,0
1298,1


In [238]:
# Load and predict on final batch
batch_final = np.load('FINAL_batch.npz')
X_final = pd.DataFrame(batch4['X'], columns=["user", "item", "rating"])

# Get features on final batch
X_final_features = get_features(X_final)


In [239]:
# Random Forest for final batch
rf.fit(X_features, np.ravel(y, order='C'))
y_pred_rf = rf.predict(X_final_features)
y_pred_proba_rf = rf.predict_proba(X_final_features)[:,1]

In [240]:
np.savez('W12_predictions_final.npz', predictions)