# Introduction

This notebook combines the `prepare_data` and `pytorch-bst` notebooks into a single notebook so that it can be run in Google Colab.

# Prepare data section

In [None]:
import pandas as pd
import torch
from tqdm import tqdm
import math
from urllib.request import urlretrieve
from zipfile import ZipFile
import os
import numpy as np
from math import sqrt
from numpy.random import choice
import math
import random

## Settings

In [None]:
# Window-size setting. It is not referenced anywhere in this section; presumably it is the
# length of the per-user sequence windows built later (TODO confirm against that code).
WINDOW_SIZE = 20

## Data

In [None]:
# Download the MovieLens 1M archive into the working directory.
urlretrieve("http://files.grouplens.org/datasets/movielens/ml-1m.zip", "movielens.zip")
# Extract inside a context manager so the archive's file handle is closed as soon as
# extraction finishes instead of being left open until garbage collection.
with ZipFile("movielens.zip", "r") as archive:
    archive.extractall()

In [None]:
# The MovieLens .dat files are header-less and use the multi-character delimiter "::".
# pandas can only handle such a separator with its Python parsing engine; requesting that
# engine explicitly avoids the ParserWarning raised when pandas has to fall back to it.
users = pd.read_csv(
    "ml-1m/users.dat",
    sep="::",
    names=["user_id", "sex", "age_group", "occupation", "zip_code"],
    engine="python",
)

ratings = pd.read_csv(
    "ml-1m/ratings.dat",
    sep="::",
    names=["user_id", "movie_id", "rating", "unix_timestamp"],
    engine="python",
)

# The movies file is decoded as ISO-8859-1 (Latin-1).
movies = pd.read_csv(
    "ml-1m/movies.dat",
    sep="::",
    names=["movie_id", "title", "genres"],
    encoding="ISO-8859-1",
    engine="python",
)

  return func(*args, **kwargs)


In [None]:
## Movies
# Take the four characters just before the last character of each title (assumes titles end
# in "(YYYY)") and store them as integer category codes rather than as the year itself.
release_years = [title[-5:-1] for title in movies["title"]]
movies["year"] = pd.Categorical(release_years).codes

## Users
# Replace every categorical user attribute by its integer category code.
for user_column in ("sex", "age_group", "occupation", "zip_code"):
    users[user_column] = pd.Categorical(users[user_column]).codes

# Ratings
# Interpret the raw timestamps as seconds since the Unix epoch and convert them to datetimes.
ratings["unix_timestamp"] = pd.to_datetime(ratings["unix_timestamp"], unit="s")


In [None]:
# Save primary csv's
# exist_ok=True creates the output folder only when it is missing, which replaces the
# separate "check, then create" steps (and the window between them) with a single call.
os.makedirs("data", exist_ok=True)

users.to_csv("data/users.csv", index=False)
movies.to_csv("data/movies.csv", index=False)
ratings.to_csv("data/ratings.csv", index=False)

In [None]:
# Cast every identifier column of the three frames (movies, users, ratings) to str.
id_columns_by_frame = (
    (movies, ("movie_id",)),
    (users, ("user_id",)),
    (ratings, ("movie_id", "user_id")),
)
for frame, id_columns in id_columns_by_frame:
    for id_column in id_columns:
        frame[id_column] = frame[id_column].astype(str)

In [None]:
genres = [
    "Action", "Adventure", "Animation", "Children's", "Comedy", "Crime",
    "Documentary", "Drama", "Fantasy", "Film-Noir", "Horror", "Musical",
    "Mystery", "Romance", "Sci-Fi", "Thriller", "War", "Western",
]

# Split every movie's "|"-separated genre string once, then add one 0/1 indicator column
# per genre telling whether that genre appears among the movie's tags.
genre_tags = [value.split("|") for value in movies["genres"]]
for genre in genres:
    movies[genre] = [int(genre in tags) for tags in genre_tags]


### Transform the movie ratings data into sequences

First, let's sort the ratings data using the `unix_timestamp`, and then group the
`movie_id` values and the `rating` values by `user_id`.

The output DataFrame will have a record for each `user_id`, with two ordered lists
(sorted by rating datetime): the movies they have rated, and their ratings of these movies.

In [None]:
# Order all ratings chronologically, then group them per user so that each group's rows
# keep that chronological order.
ratings_group = ratings.sort_values(by=["unix_timestamp"]).groupby("user_id")

# Output column -> source column collected into one list per user.
sequence_sources = {
    "movie_ids": "movie_id",
    "ratings": "rating",
    "timestamps": "unix_timestamp",
}

# NOTE(review): this assumes `groups.keys()` iterates users in the same order as the
# per-group `apply(list)` results — confirm for the pandas version in use.
ratings_data = pd.DataFrame(
    data={
        "user_id": list(ratings_group.groups.keys()),
        **{
            target: list(ratings_group[source].apply(list))
            for target, source in sequence_sources.items()
        },
    }
)


Now, let's split the `movie_ids` list into a set of sequences of a fixed length.
We do the same for the `ratings`. Set the `sequence_length` variable to change the length
of the input sequence to the model. You can also change the `step_size` to control the
number of sequences to generate for each user.

In [None]:
# Display the per-user frame: one row per user with parallel lists of movie IDs, ratings
# and timestamps.
ratings_data

Unnamed: 0,user_id,movie_ids,ratings,timestamps
0,1,"[3186, 1721, 1270, 1022, 2340, 1836, 3408, 120...","[4, 4, 5, 5, 3, 5, 4, 4, 5, 4, 3, 5, 4, 4, 4, ...","[2000-12-31 22:00:19, 2000-12-31 22:00:55, 200..."
1,10,"[597, 858, 743, 1210, 1948, 2312, 3751, 1282, ...","[4, 3, 3, 4, 4, 5, 5, 5, 3, 3, 3, 5, 4, 4, 4, ...","[2000-12-31 00:59:35, 2000-12-31 00:59:35, 200..."
2,100,"[260, 1676, 1198, 541, 1210, 3948, 3536, 2567,...","[4, 3, 4, 3, 4, 3, 1, 1, 5, 4, 4, 3, 2, 3, 4, ...","[2000-12-23 17:46:35, 2000-12-23 17:46:35, 200..."
3,1000,"[971, 260, 2990, 2973, 1210, 3068, 3153, 1198,...","[4, 5, 4, 3, 5, 5, 2, 5, 5, 4, 5, 4, 3, 5, 5, ...","[2000-11-24 04:36:06, 2000-11-24 04:36:06, 200..."
4,1001,"[1198, 1617, 2885, 3909, 3555, 1479, 3903, 394...","[4, 4, 4, 2, 2, 1, 4, 5, 5, 4, 4, 4, 4, 3, 4, ...","[2000-11-24 04:19:51, 2000-11-24 04:21:42, 200..."
...,...,...,...,...
6035,995,"[1894, 260, 247, 433, 170, 74, 912, 3097, 1265...","[2, 4, 5, 3, 3, 4, 4, 4, 3, 5, 5, 5, 5, 5, 5, ...","[2000-11-24 08:33:05, 2000-11-24 08:33:05, 200..."
6036,996,"[1347, 2146, 1961, 2741, 1210, 527, 1196, 1213...","[4, 3, 5, 3, 5, 5, 5, 5, 4, 2, 5, 5, 5, 4, 5, ...","[2000-11-24 07:48:52, 2000-11-24 07:48:52, 200..."
6037,997,"[1196, 2082, 3247, 2447, 2633, 2028, 593, 318,...","[4, 3, 3, 3, 2, 5, 5, 5, 4, 4, 5, 4, 4, 3, 4, ...","[2000-11-24 05:37:15, 2000-11-24 05:40:25, 200..."
6038,998,"[2266, 1264, 1097, 1641, 805, 1388, 1968, 3751...","[3, 4, 5, 5, 4, 3, 4, 3, 4, 4, 4, 4, 5, 5, 4, ...","[2000-11-24 05:24:59, 2000-11-24 05:26:33, 200..."


# Here is where we change the data into different ratings

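* ratings_data -> no changes to the sequence
* inputed_ratings_data -> sequences in which a share of the movies is replaced by randomly chosen movies the user has not rated, each with a random rating
* random_ratings_data -> sequences in which a share of the positions is randomly shuffled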

Truth -> random sequences

Toy Story III - Toy Story II - Toy Story -  Toy Story IV

Toy Story - Toy Story II - Toy Story III - Toy Story IV
Toy Story II - Toy Story III - Titanic - Toy Story IV

### Sequence creation code

# Methods for sequence perturbation and perturbation calculation

### Random Inputation Methods

In [None]:
def swap_elements(x, t, ratings):
  """Return copies of `x` and `ratings` with random positions overwritten.

  One distinct position of `x` is drawn for every element of `t`. That position
  receives the element, and the same position of the ratings copy receives a
  random integer rating between 1 and 5 (inclusive). The inputs are not modified.

  Positions are drawn with the standard-library `random` module, the same
  generator that produces the ratings, so a single `random.seed(...)` call makes
  the whole perturbation reproducible.

  Args:
    x: list to perturb (copied with a slice).
    t: replacement values to write into the copy of `x`.
    ratings: list parallel to `x` (copied with a slice).

  Returns:
    Tuple `(new_x, new_ratings)`.

  Raises:
    ValueError: if `t` has more elements than `x` (not enough distinct positions).
  """
  new_x = x[:]
  new_ratings = ratings[:]
  # Sample without replacement so that no position is overwritten twice.
  positions = random.sample(range(len(x)), k=len(t))
  for idx, value in zip(positions, t):
    new_x[idx] = value
    new_ratings[idx] = random.randint(1, 5)
  return new_x, new_ratings

In [None]:
def random_inputation1(df, col1 ,col2 ,val, percentage = True):
  """Replace part of every row's movie sequence with movies from its `other_movies` list.

  For each row, `n_chose` replacement movies are drawn from the row's
  `other_movies` entry with `random.choices` (sampling with replacement, so the
  same movie can be drawn more than once) and written into the sequence by
  `swap_elements`. The perturbed sequences are stored on `df` in the new
  columns `inp_movie_ids` and `inp_ratings`.

  Args:
    df: DataFrame to perturb; it is modified in place. It must contain an
      `other_movies` column holding the candidate replacement movies per row.
    col1: name of the movie sequence column.
    col2: name of the rating sequence column.
    val: amount of perturbation; a fraction of the sequence length when
      `percentage` is truthy, otherwise the number of movies to replace.
    percentage: selects how `val` is interpreted (see above).

  Returns:
    The same `df`, with the two columns added.
  """
  inp_movie_ids = []
  inp_ratings = []
  for _, row in df.iterrows():
    user_movies_sequence = row[col1]
    ratings = row[col2]

    # A plain truthiness test always binds n_chose; comparing with `== True` and
    # `== False` left it unbound (or stale from the previous row) for other values.
    if percentage:
      n_chose = math.floor(len(user_movies_sequence) * val)
    else:
      n_chose = val

    movies_to_input = random.choices(row['other_movies'], k=n_chose)
    new_sequence, new_ratings = swap_elements(user_movies_sequence, movies_to_input, ratings)
    inp_movie_ids.append(new_sequence)
    inp_ratings.append(new_ratings)
  df['inp_movie_ids'] = inp_movie_ids
  df['inp_ratings'] = inp_ratings
  return df

### Corruption Measurement

Counts the difference between the original sequence and the "perturbed" sequence

In [None]:
def measure_corruption(df, column1 , column2 ):
  """Count, per row, how many positions differ between two sequence columns.

  Adds two columns to `df` (modified in place):
    * `movies_corrupted`: number of positions `i` where the `column1` sequence
      and the `column2` sequence hold different values.
    * `percentage_change`: that count as a percentage of the `column1`
      sequence length, rounded to two decimals (0.0 for an empty sequence).

  Args:
    df: DataFrame whose rows hold the two sequences.
    column1: name of the reference (original) sequence column.
    column2: name of the perturbed sequence column; each of its sequences must
      be at least as long as the corresponding `column1` sequence.

  Returns:
    The same `df`, with the two columns added.

  Raises:
    IndexError: if a `column2` sequence is shorter than its `column1` sequence.
  """
  perc_change = []
  n_corrupted = []
  for _, row in df.iterrows():
    original = row[column1]
    perturbed = row[column2]
    length = len(original)
    differing = sum(1 for i in range(length) if original[i] != perturbed[i])
    # An empty sequence has nothing to corrupt: report 0% instead of dividing by zero.
    perc_change.append(round(differing / length * 100, 2) if length else 0.0)
    n_corrupted.append(differing)
  df['percentage_change'] = perc_change
  df['movies_corrupted'] = n_corrupted
  return df

In [None]:
# Display the per-user sequences again before perturbing them.
ratings_data

Unnamed: 0,user_id,movie_ids,ratings,timestamps
0,1,"[3186, 1721, 1270, 1022, 2340, 1836, 3408, 120...","[4, 4, 5, 5, 3, 5, 4, 4, 5, 4, 3, 5, 4, 4, 4, ...","[2000-12-31 22:00:19, 2000-12-31 22:00:55, 200..."
1,10,"[597, 858, 743, 1210, 1948, 2312, 3751, 1282, ...","[4, 3, 3, 4, 4, 5, 5, 5, 3, 3, 3, 5, 4, 4, 4, ...","[2000-12-31 00:59:35, 2000-12-31 00:59:35, 200..."
2,100,"[260, 1676, 1198, 541, 1210, 3948, 3536, 2567,...","[4, 3, 4, 3, 4, 3, 1, 1, 5, 4, 4, 3, 2, 3, 4, ...","[2000-12-23 17:46:35, 2000-12-23 17:46:35, 200..."
3,1000,"[971, 260, 2990, 2973, 1210, 3068, 3153, 1198,...","[4, 5, 4, 3, 5, 5, 2, 5, 5, 4, 5, 4, 3, 5, 5, ...","[2000-11-24 04:36:06, 2000-11-24 04:36:06, 200..."
4,1001,"[1198, 1617, 2885, 3909, 3555, 1479, 3903, 394...","[4, 4, 4, 2, 2, 1, 4, 5, 5, 4, 4, 4, 4, 3, 4, ...","[2000-11-24 04:19:51, 2000-11-24 04:21:42, 200..."
...,...,...,...,...
6035,995,"[1894, 260, 247, 433, 170, 74, 912, 3097, 1265...","[2, 4, 5, 3, 3, 4, 4, 4, 3, 5, 5, 5, 5, 5, 5, ...","[2000-11-24 08:33:05, 2000-11-24 08:33:05, 200..."
6036,996,"[1347, 2146, 1961, 2741, 1210, 527, 1196, 1213...","[4, 3, 5, 3, 5, 5, 5, 5, 4, 2, 5, 5, 5, 4, 5, ...","[2000-11-24 07:48:52, 2000-11-24 07:48:52, 200..."
6037,997,"[1196, 2082, 3247, 2447, 2633, 2028, 593, 318,...","[4, 3, 3, 3, 2, 5, 5, 5, 4, 4, 5, 4, 4, 3, 4, ...","[2000-11-24 05:37:15, 2000-11-24 05:40:25, 200..."
6038,998,"[2266, 1264, 1097, 1641, 805, 1388, 1968, 3751...","[3, 4, 5, 5, 4, 3, 4, 3, 4, 4, 4, 4, 5, 5, 4, ...","[2000-11-24 05:24:59, 2000-11-24 05:26:33, 200..."


# Only procedure

Perturb the whole movie sequence and then feed it into models by building sequences

In [None]:
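# Disabled cell: the Drive export below is commented out and kept for reference only.
# NOTE: `train_data_final` is not defined anywhere in this section of the notebook.

# Import Drive API and authenticate.
# from google.colab import drive

# # Mount your Drive to the Colab VM.
# drive.mount('/gdrive')

# # Write the DataFrame to CSV file.
# with open('/gdrive/My Drive/547 Project/data/train_data_final.csv', 'w') as f:
#   train_data_final.to_csv(f)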

In [None]:
# Keep an independent deep copy of the unperturbed sequences (per the original note, for
# the second procedure), so later changes to either frame cannot affect the other.
perturbation_0 = ratings_data.copy(deep=True)

In [None]:
# No perturbation

In [None]:
# Import Drive API and authenticate.
from google.colab import drive

# Mount your Drive to the Colab VM.
drive.mount('/gdrive')

# Write the unperturbed DataFrame to a CSV file on Drive. The path is handed to pandas
# directly so that pandas opens the file itself with the newline handling its CSV writer
# expects, instead of receiving a handle opened in plain text mode.
perturbation_0.to_csv('/gdrive/My Drive/547 Project/data/truth.csv')

Mounted at /gdrive


In [None]:
def shuffle(movie_sequence, ratings, count):
    '''Permute `count` randomly chosen positions of two parallel lists.

    `count` distinct positions are sampled, and the values found at those
    positions are redistributed among the same positions in a random order.
    The movie and its rating always move together, and positions that were
    not sampled keep their values. The inputs are left untouched.

    Returns a tuple `(new_movie_sequence, new_ratings_sequence)`.
    '''
    targets = random.sample(range(len(movie_sequence)), k=count)
    # Same positions in a random order: targets[i] receives the value from sources[i].
    sources = targets.copy()
    random.shuffle(sources)

    shuffled_movies = movie_sequence.copy()
    shuffled_ratings = ratings.copy()
    for target, source in zip(targets, sources):
        shuffled_movies[target] = movie_sequence[source]
        shuffled_ratings[target] = ratings[source]

    return shuffled_movies, shuffled_ratings

In [None]:
def random_shuffling(df,col1,col2, val, percentage = True):
  """Shuffle part of every row's movie/rating sequence and store the result on `df`.

  For each row, `shuffle` is applied to the `col1` and `col2` sequences. The
  shuffled sequences are written to the new columns `random_movie_ids` and
  `random_ratings`; `df` is modified in place and also returned.

  Args:
    df: DataFrame holding one sequence pair per row.
    col1: name of the movie sequence column.
    col2: name of the rating sequence column.
    val: amount of perturbation; a fraction of the sequence length when
      `percentage` is truthy, otherwise the number of positions to shuffle.
    percentage: selects how `val` is interpreted (see above).
  """
  shuffled_pairs = []
  for _, row in df.iterrows():
    movie_sequence = row[f'{col1}']
    rating_sequence = row[f'{col2}']
    n_shuffle = math.floor(len(movie_sequence) * val) if percentage else val
    shuffled_pairs.append(shuffle(movie_sequence, rating_sequence, n_shuffle))
  df['random_movie_ids'] = [movies_part for movies_part, _ in shuffled_pairs]
  df['random_ratings'] = [ratings_part for _, ratings_part in shuffled_pairs]
  return df

# Second Procedure (2) 

First perturb movie list and then separate into window sequences

## Random Inputation

In [None]:
def random_inputation2(df, all_movies_set, col1 ,col2 ,val, percentage = True, seed = 10):
  """Replace part of every row's movie sequence with movies absent from that sequence.

  For each row, the candidates are the members of `all_movies_set` that do not
  occur in the row's movie sequence. `n_chose` of them are drawn with
  `random.choices` (sampling with replacement, so a movie can be drawn more than
  once) and written into the sequence by `swap_elements`. The perturbed
  sequences are stored on `df` in the new columns `inp_movie_ids` and
  `inp_ratings`.

  Args:
    df: DataFrame to perturb; it is modified in place.
    all_movies_set: collection of all movie IDs; its members must be mutually
      orderable because they are sorted.
    col1: name of the movie sequence column.
    col2: name of the rating sequence column.
    val: amount of perturbation; a fraction of the sequence length when
      `percentage` is truthy, otherwise the number of movies to replace.
    percentage: selects how `val` is interpreted (see above).
    seed: value passed to `random.seed` before sampling. The default of 10
      keeps the previous hard-coded behaviour; pass None to leave the
      generator state untouched.

  Returns:
    The same `df`, with the two columns added.
  """
  if seed is not None:
    random.seed(seed)

  # Sample from a sorted list rather than from `list(<set>)`: the iteration order of a set
  # of str IDs depends on per-process hash randomisation, so the same seed would otherwise
  # select different movies in every kernel session.
  ordered_catalogue = sorted(set(all_movies_set))

  inp_movie_ids = []
  inp_ratings = []
  for _, row in df.iterrows():
    user_movies_sequence = row[col1]
    ratings = row[col2]
    already_rated = set(user_movies_sequence)
    candidates = [movie for movie in ordered_catalogue if movie not in already_rated]
    if percentage:
      n_chose = math.floor(len(user_movies_sequence) * val)
    else:
      n_chose = val
    movies_to_input = random.choices(candidates, k=n_chose)
    new_sequence, new_ratings = swap_elements(user_movies_sequence, movies_to_input, ratings)
    inp_movie_ids.append(new_sequence)
    inp_ratings.append(new_ratings)
  df['inp_movie_ids'] = inp_movie_ids
  df['inp_ratings'] = inp_ratings
  return df

In [None]:
# Set of every movie ID in the movies table.
all_movies_set = set(movies["movie_id"].tolist())

In [None]:
# Replace 10% of every user's movies with random ones, working on a deep copy so that
# ratings_data itself stays unperturbed. Change the 0.1 to use another percentage.
random.seed(10)
inputed_ratings_data_10 = random_inputation2(
    ratings_data.copy(deep=True), all_movies_set, 'movie_ids', 'ratings', 0.1, True
)

In [None]:
# Display the 10% run: `inp_movie_ids` / `inp_ratings` hold the perturbed sequences next to
# the original `movie_ids` / `ratings` columns.
inputed_ratings_data_10

Unnamed: 0,user_id,movie_ids,ratings,timestamps,inp_movie_ids,inp_ratings
0,1,"[3186, 1721, 1270, 1022, 2340, 1836, 3408, 120...","[4, 4, 5, 5, 3, 5, 4, 4, 5, 4, 3, 5, 4, 4, 4, ...","[2000-12-31 22:00:19, 2000-12-31 22:00:55, 200...","[3186, 1721, 1270, 1022, 2340, 1836, 3408, 120...","[4, 4, 5, 5, 3, 5, 4, 4, 1, 4, 3, 5, 4, 4, 2, ..."
1,10,"[597, 858, 743, 1210, 1948, 2312, 3751, 1282, ...","[4, 3, 3, 4, 4, 5, 5, 5, 3, 3, 3, 5, 4, 4, 4, ...","[2000-12-31 00:59:35, 2000-12-31 00:59:35, 200...","[3440, 858, 743, 955, 2626, 2312, 3751, 1282, ...","[4, 3, 3, 5, 1, 5, 5, 5, 3, 4, 3, 5, 4, 4, 4, ..."
2,100,"[260, 1676, 1198, 541, 1210, 3948, 3536, 2567,...","[4, 3, 4, 3, 4, 3, 1, 1, 5, 4, 4, 3, 2, 3, 4, ...","[2000-12-23 17:46:35, 2000-12-23 17:46:35, 200...","[260, 1676, 1198, 541, 1210, 3948, 3536, 111, ...","[4, 3, 4, 3, 4, 3, 1, 1, 5, 2, 4, 4, 2, 3, 4, ..."
3,1000,"[971, 260, 2990, 2973, 1210, 3068, 3153, 1198,...","[4, 5, 4, 3, 5, 5, 2, 5, 5, 4, 5, 4, 3, 5, 5, ...","[2000-11-24 04:36:06, 2000-11-24 04:36:06, 200...","[971, 260, 2990, 3778, 1210, 3068, 2843, 1198,...","[4, 5, 4, 1, 5, 5, 4, 5, 5, 4, 5, 4, 3, 5, 5, ..."
4,1001,"[1198, 1617, 2885, 3909, 3555, 1479, 3903, 394...","[4, 4, 4, 2, 2, 1, 4, 5, 5, 4, 4, 4, 4, 3, 4, ...","[2000-11-24 04:19:51, 2000-11-24 04:21:42, 200...","[1198, 1617, 3234, 3909, 3555, 320, 3903, 3949...","[4, 4, 4, 2, 2, 3, 4, 5, 5, 4, 4, 4, 4, 3, 4, ..."
...,...,...,...,...,...,...
6035,995,"[1894, 260, 247, 433, 170, 74, 912, 3097, 1265...","[2, 4, 5, 3, 3, 4, 4, 4, 3, 5, 5, 5, 5, 5, 5, ...","[2000-11-24 08:33:05, 2000-11-24 08:33:05, 200...","[1894, 260, 247, 433, 170, 74, 912, 2215, 515,...","[2, 4, 5, 3, 3, 4, 4, 3, 3, 5, 5, 5, 5, 5, 5, ..."
6036,996,"[1347, 2146, 1961, 2741, 1210, 527, 1196, 1213...","[4, 3, 5, 3, 5, 5, 5, 5, 4, 2, 5, 5, 5, 4, 5, ...","[2000-11-24 07:48:52, 2000-11-24 07:48:52, 200...","[1347, 2146, 1961, 2741, 1210, 527, 1196, 1213...","[4, 3, 5, 3, 5, 5, 5, 5, 4, 2, 5, 5, 5, 3, 5, ..."
6037,997,"[1196, 2082, 3247, 2447, 2633, 2028, 593, 318,...","[4, 3, 3, 3, 2, 5, 5, 5, 4, 4, 5, 4, 4, 3, 4, ...","[2000-11-24 05:37:15, 2000-11-24 05:40:25, 200...","[1196, 2082, 3247, 2447, 2633, 2028, 593, 3190...","[4, 3, 3, 3, 2, 5, 5, 4, 4, 4, 4, 4, 4, 3, 4, ..."
6038,998,"[2266, 1264, 1097, 1641, 805, 1388, 1968, 3751...","[3, 4, 5, 5, 4, 3, 4, 3, 4, 4, 4, 4, 5, 5, 4, ...","[2000-11-24 05:24:59, 2000-11-24 05:26:33, 200...","[2266, 1264, 1097, 1641, 3099, 1388, 1968, 375...","[3, 4, 5, 5, 2, 3, 4, 3, 4, 3, 4, 4, 5, 5, 1, ..."


In [None]:
# Build one perturbed frame per corruption level (10% ... 50% of each sequence replaced).
# Every level reseeds the generator and starts from a fresh deep copy of ratings_data, so
# the runs are independent of one another. Edit the fractions to use other percentages.
inputed_by_percent = {}
for percent, fraction in ((10, 0.1), (20, 0.2), (30, 0.3), (40, 0.4), (50, 0.5)):
    random.seed(10)
    random_df = ratings_data.copy(deep=True)
    inputed_by_percent[percent] = random_inputation2(
        random_df, all_movies_set, 'movie_ids', 'ratings', fraction, True
    )

# Expose each level under its own name.
inputed_ratings_data_10 = inputed_by_percent[10]
inputed_ratings_data_20 = inputed_by_percent[20]
inputed_ratings_data_30 = inputed_by_percent[30]
inputed_ratings_data_40 = inputed_by_percent[40]
inputed_ratings_data_50 = inputed_by_percent[50]


In [None]:
# Write each perturbed DataFrame to its own CSV file on Drive.
from google.colab import drive
drive.mount('/gdrive')

inputed_frames = {
    10: inputed_ratings_data_10,
    20: inputed_ratings_data_20,
    30: inputed_ratings_data_30,
    40: inputed_ratings_data_40,
    50: inputed_ratings_data_50,
}
# The path is handed to pandas directly so that pandas opens the file itself with the
# newline handling its CSV writer expects, instead of receiving a plain text-mode handle.
for percent, frame in inputed_frames.items():
    frame.to_csv(f'/gdrive/My Drive/547 Project/data/inputed_ratings_data_{percent}.csv')

Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).


## Random Sequences

In [None]:
# Build one shuffled frame per perturbation level (10% ... 50% of each sequence shuffled).
# Every level reseeds the generator and starts from a fresh deep copy of ratings_data, so
# the runs are independent of one another. Edit the fractions to use other percentages.
shuffled_by_percent = {}
for percent, fraction in ((10, .1), (20, .2), (30, .3), (40, .4), (50, .5)):
    random.seed(10)
    random_df = ratings_data.copy(deep=True)
    shuffled_by_percent[percent] = random_shuffling(
        random_df, 'movie_ids', 'ratings', fraction, True
    )

# Expose each level under its own name.
random_ratings_data_10 = shuffled_by_percent[10]
random_ratings_data_20 = shuffled_by_percent[20]
random_ratings_data_30 = shuffled_by_percent[30]
random_ratings_data_40 = shuffled_by_percent[40]
random_ratings_data_50 = shuffled_by_percent[50]


In [None]:
# Write each shuffled DataFrame to its own CSV file on Drive.
from google.colab import drive
drive.mount('/gdrive')

shuffled_frames = {
    10: random_ratings_data_10,
    20: random_ratings_data_20,
    30: random_ratings_data_30,
    40: random_ratings_data_40,
    50: random_ratings_data_50,
}
# The path is handed to pandas directly so that pandas opens the file itself with the
# newline handling its CSV writer expects, instead of receiving a plain text-mode handle.
for percent, frame in shuffled_frames.items():
    frame.to_csv(f'/gdrive/My Drive/547 Project/data/random_ratings_data_{percent}.csv')


Drive already mounted at /gdrive; to attempt to forcibly remount, call drive.mount("/gdrive", force_remount=True).
