### CS 421 PROJECT

In [1]:
import numpy as np
import pandas as pd

# Load all previous weeks' data as training sets
batch1=np.load("first_batch.npz")
batch2=np.load("second_batch_with_labels.npz")
batch3=np.load("third_batch_with_labels.npz")

In [11]:
X1=pd.DataFrame(batch1["X"], columns=["user", "item", "rating"])
y1=pd.DataFrame(batch1["y"], columns=["user", "label"])
X2=pd.DataFrame(batch2["X"], columns=["user", "item", "rating"])
y2=pd.DataFrame(batch2["y"], columns=["user", "label"])
X3=pd.DataFrame(batch3["X"], columns=["user", "item", "rating"])
y3=pd.DataFrame(batch3["y"], columns=["user", "label"])

# Concatenate all previous weeks' batches into a single dataframe each
data_X = [X1, X2, X3]
data_y = [y1, y2, y3]
X = pd.concat(data_X)
y = pd.concat(data_y)

In [6]:
X.head()

Unnamed: 0,user,item,rating
0,2549,0,4
1,2549,30,3
2,2549,55,2
3,2549,67,2
4,2549,72,4


In [12]:
y.drop('user', axis=1, inplace=True)
y.head()

Unnamed: 0,label
0,0
1,0
2,0
3,0
4,0


In [13]:
def get_features(X: pd.DataFrame):
  features = pd.DataFrame(X.groupby('user')['user'].max())
  # features['mode'] = X.groupby('user')['rating'].apply(lambda x: x.mode())
  features['mean'] = X.groupby('user')['rating'].mean()
  features['median'] = X.groupby('user')['rating'].median()
  features['max'] = X.groupby('user')['rating'].max()
  features['min'] = X.groupby('user')['rating'].min()
  features['mode'] = X.groupby('user')['rating'].apply(lambda x: x.value_counts().index[0])
  return features

# X1_features = get_features(X1)
# X2_features = get_features(X2)
# X3_features = get_features(X3)
X_features = get_features(X)


In [77]:
from sklearn.model_selection import train_test_split

# Anomaly detection methods
from sklearn.ensemble import IsolationForest
from sklearn.linear_model import LogisticRegression
from sklearn.cluster import KMeans
from sklearn.preprocessing import StandardScaler, RobustScaler
from sklearn.mixture import GaussianMixture

from sklearn.metrics import auc, roc_auc_score, precision_score, recall_score, f1_score

In [83]:
X_train, X_test, y_train, y_test = train_test_split(X_features, y, test_size=0.2, random_state=42)

########### Isolation Forest ##########
IL=IsolationForest(n_estimators=500)
IL.fit(X_train)


########### Logistic Regression ##########
lr = LogisticRegression()
lr.fit(X_train, np.ravel(y_train, order='C'))
y_pred_lr = lr.predict_proba(X_test)[:,1]

########### Kmeans ##########
N=4 # 4 seems optimal (i tried 2 to 12 clusters)
kmeans = KMeans(n_clusters=N, init='k-means++', max_iter=50, random_state=0)
kmeans.fit(X_train)
scaler=[RobustScaler() for _ in range(N)]
distances_train, centres_train = kmeans.transform(X_train), kmeans.predict(X_train)
distnear_train = distances_train[range(centres_train.shape[0]), centres_train]
distances_test, centres_test = kmeans.transform(X_test), kmeans.predict(X_test)
distnear_test = distances_test[range(centres_test.shape[0]), centres_test]

for i in range(N):
    scaler[i].fit(distnear_train[centres_train==i].reshape(-1,1))
for i in range(N):
    distnear_test[centres_test==i] = scaler[i].transform(distnear_test[centres_test==i].reshape(-1,1)).reshape(-1)
    
########## Gaussian Mixture ##########
NN = 1
gm = GaussianMixture(n_components=NN, random_state=0, covariance_type="full").fit(X_train)
y_pred_gm = -gm.score_samples(X_test)
    
    
# Print AUC scores
print("######################### AUC SCORES ######################### ")
print("Isolation Forest:", roc_auc_score(y_test,-IL.score_samples(X_test)))
print("Logistic Regression:", roc_auc_score(y_test,y_pred_lr))
print("KMeans:", roc_auc_score(y_test, distnear_test))
print("Gaussian Mixture:", roc_auc_score(y_test, y_pred_gm))
print("##############################################################")


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(


######################### AUC SCORES ######################### 
Isolation Forest: 0.6879444304508728
Logistic Regression: 0.8958700664587007
KMeans: 0.5343870492724418
Gaussian Mixture: 0.7156517021565171
##############################################################


In [51]:
# Load and predict on fourth batch
batch4 = np.load('fourth_batch.npz')
X4 = pd.DataFrame(batch4['X'], columns=["user", "item", "rating"])
X4.nunique()

user      1300
item       990
rating       6
dtype: int64

In [None]:
X4_features = get_features(X4)

y_pred_logreg = logreg.predict(X4_features)
y_pred_proba = logreg.predict_proba(X4_features)[::,1]


# Evaluation
print("------Logistic regression evaluation metrics------")
print(f"AUC: {roc_auc_score(y_test, y_pred_proba)}")
print(f"Precision: {precision_score(y_test, y_pred_logreg)}")
print(f"Recall: {recall_score(y_test, y_pred_logreg)}")
print(f"F1: {f1_score(y_test, y_pred_logreg)}")

In [None]:
predictions = pd.DataFrame(y_pred_proba, columns=['y_pred'])
predictions

In [None]:
np.savez('W11_predictions_emmanuel.npz', predictions)