In [9]:
import pandas as pd

# Parse the ratings file into per-column lists, then build the DataFrame once.
# The loop treats any line containing ':' as a "<movie_id>:" header and every
# other line as a "<user_id>,<rating>,<date>" row belonging to the most recent
# header. All values are kept as strings here.
data = {'MovieID': [], 'UserID': [], 'Rating': [], 'Date': []}

# Open the file and parse it
with open('../archive/combined_data_1.txt', 'r') as file:
    movie_id = None
    for line in file:
        line = line.strip()
        if not line:
            # A blank line (e.g. a trailing newline at end of file) carries no
            # record; unpacking it into three fields would raise ValueError.
            continue
        if ':' in line:
            # Header line: remember the movie id for the rows that follow.
            movie_id = line.split(':')[0]
        else:
            user_id, rating, date = line.split(',')
            data['MovieID'].append(movie_id)
            data['UserID'].append(user_id)
            data['Rating'].append(rating)
            data['Date'].append(date)

# Convert to DataFrame
df = pd.DataFrame(data)


In [11]:
# Preview the first rows to confirm the parser produced the four expected columns.
df.head()

Unnamed: 0,MovieID,UserID,Rating,Date
0,1,1488844,3,2005-09-06
1,1,822109,5,2005-05-13
2,1,885013,4,2005-10-19
3,1,30878,4,2005-12-26
4,1,823519,3,2004-05-03


In [13]:
# Load the movie catalogue. header=None tells pandas there is no header row,
# so the three column names are supplied explicitly.
movies = pd.read_csv(
    '../archive/movie_titles.csv',
    encoding='ISO-8859-1',
    header=None,
    names=['MovieID', 'Year', 'Title'],
)

In [14]:
# The parser stored every field as a string; cast the id/rating columns to
# integers and the date column to datetimes.
for int_column in ("MovieID", "UserID", "Rating"):
    df[int_column] = df[int_column].astype(int)
df["Date"] = pd.to_datetime(df["Date"])

In [15]:
# Verify the dtype casts above took effect and check the memory footprint.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 24053764 entries, 0 to 24053763
Data columns (total 4 columns):
 #   Column   Dtype         
---  ------   -----         
 0   MovieID  int32         
 1   UserID   int32         
 2   Rating   int32         
 3   Date     datetime64[ns]
dtypes: datetime64[ns](1), int32(3)
memory usage: 458.8 MB


In [16]:
# Check dtypes and non-null counts of the movie catalogue before merging.
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17770 entries, 0 to 17769
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype  
---  ------   --------------  -----  
 0   MovieID  17770 non-null  int64  
 1   Year     17763 non-null  float64
 2   Title    17770 non-null  object 
dtypes: float64(1), int64(1), object(1)
memory usage: 416.6+ KB


In [68]:
# Attach Year and Title to each rating; a left join keeps every rating row
# even when the movie id has no match in the catalogue.
merged_df = df.merge(movies, how='left', on='MovieID')

In [69]:
# Discard every row that has a missing value in any column.
merged_df = merged_df.dropna(axis=0, how='any')

In [70]:
# Year was read as float; with the missing rows dropped it can be cast to an integer.
merged_df = merged_df.astype({"Year": int})

In [31]:
# Preview the merged frame: rating columns plus Year and Title.
merged_df.head()

Unnamed: 0,MovieID,UserID,Rating,Date,Year,Title
0,1,1488844,3,2005-09-06,2003,Dinosaur Planet
1,1,822109,5,2005-05-13,2003,Dinosaur Planet
2,1,885013,4,2005-10-19,2003,Dinosaur Planet
3,1,30878,4,2005-12-26,2003,Dinosaur Planet
4,1,823519,3,2004-05-03,2003,Dinosaur Planet


In [71]:
# The join key is no longer needed once Year and Title are attached.
merged_df = merged_df.drop('MovieID', axis='columns')

In [72]:
# Confirm MovieID is gone and the remaining columns are intact.
merged_df.head()

Unnamed: 0,UserID,Rating,Date,Year,Title
0,1488844,3,2005-09-06,2003,Dinosaur Planet
1,822109,5,2005-05-13,2003,Dinosaur Planet
2,885013,4,2005-10-19,2003,Dinosaur Planet
3,30878,4,2005-12-26,2003,Dinosaur Planet
4,823519,3,2004-05-03,2003,Dinosaur Planet


In [73]:
# Order rows by user (ascending) so each user's ratings are contiguous.
merged_df = merged_df.sort_values('UserID', ascending=True)

In [74]:
# Check row count, dtypes and memory use of the sorted frame before truncating it.
merged_df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 24053575 entries, 16820592 to 21583989
Data columns (total 5 columns):
 #   Column  Dtype         
---  ------  -----         
 0   UserID  int32         
 1   Rating  int32         
 2   Date    datetime64[ns]
 3   Year    int32         
 4   Title   object        
dtypes: datetime64[ns](1), int32(3), object(1)
memory usage: 825.8+ MB


In [75]:
# Keep only the first 200,000 rows of the user-sorted frame.
# NOTE: the cut is by row count, so the last user kept may be truncated.
merged_df = merged_df.head(n=200_000)

In [62]:
# Persist the current frame as CSV, omitting the row index.
merged_df.to_csv(path_or_buf='data_without_embeddings.csv', index=False)

In [76]:
# Display the truncated frame; the index still carries the pre-sort row labels.
merged_df

Unnamed: 0,UserID,Rating,Date,Year,Title
16820592,6,3,2005-12-04,1990,The Grifters
2547216,6,3,2005-11-25,2003,The Matrix: Revolutions
9273482,6,3,2005-03-18,1968,Rosemary's Baby
19879789,6,2,2004-09-27,1973,The Sting
2461952,6,5,2004-09-15,2004,Kill Bill: Vol. 2
...,...,...,...,...,...
20463318,21701,5,2003-10-16,1994,NYPD Blue: Season 2
18222496,21701,4,2005-11-02,1999,10 Things I Hate About You
12175160,21701,1,2004-08-15,2001,Just Visiting
3164120,21701,3,2003-12-29,1999,American Beauty


In [77]:
from sentence_transformers import SentenceTransformer

# Embed the movie titles with a pretrained sentence-transformer model.
model = SentenceTransformer('all-MiniLM-L6-v2')

titles = merged_df['Title'].tolist()

# The same title appears once per rating row, so encode each distinct title a
# single time and broadcast the vectors back to row order instead of running
# the model on every row.
title_codes, unique_titles = pd.factorize(merged_df['Title'])
# factorize marks missing values with -1, which would silently select the last
# embedding row below; fail loudly instead.
assert (title_codes >= 0).all(), "Title contains missing values"
unique_embeddings = model.encode(unique_titles.tolist())
# One embedding row per entry of `titles`, in the same order.
title_embeddings = unique_embeddings[title_codes]


In [72]:
import numpy as np

# Cache the embeddings under ../data/, the same path the next cell loads from
# (previously the file was written to the working directory instead).
np.save('../data/title_embeddings.npy', title_embeddings)

In [18]:
# Reload cached title embeddings from disk.
# NOTE(review): `embeddings` is not referenced by any later cell visible here;
# the cells below use `title_embeddings` instead.
embeddings = np.load(file='../data/title_embeddings.npy')

In [78]:
# Store each row's title embedding as a plain Python list in a new column.
merged_df = merged_df.assign(embeddings=title_embeddings.tolist())

In [80]:
# Rescale Year linearly: subtract 1890, then divide by 115.
# Presumably this maps release years 1890-2005 onto [0, 1] — TODO confirm range.
# NOTE: Year is overwritten, so re-running this cell rescales it a second time.
merged_df["Year"] = merged_df["Year"].sub(1890).div(115)

In [81]:
# Inspect the frame after adding the embeddings column and rescaling Year.
merged_df

Unnamed: 0,UserID,Rating,Date,Year,Title,embeddings
16820592,6,3,2005-12-04,0.869565,The Grifters,"[-0.01786697655916214, -0.013425135053694248, ..."
2547216,6,3,2005-11-25,0.982609,The Matrix: Revolutions,"[-0.04949893802404404, 0.019510140642523766, -..."
9273482,6,3,2005-03-18,0.678261,Rosemary's Baby,"[-0.06936520338058472, -0.09750981628894806, -..."
19879789,6,2,2004-09-27,0.721739,The Sting,"[-0.08671136200428009, 0.08106035739183426, -0..."
2461952,6,5,2004-09-15,0.991304,Kill Bill: Vol. 2,"[-0.03953291103243828, 0.06410317867994308, 0...."
...,...,...,...,...,...,...
20463318,21701,5,2003-10-16,0.904348,NYPD Blue: Season 2,"[-0.0771060511469841, -0.023183060809969902, -..."
18222496,21701,4,2005-11-02,0.947826,10 Things I Hate About You,"[-0.06518016010522842, 0.08766945451498032, 0...."
12175160,21701,1,2004-08-15,0.965217,Just Visiting,"[0.050976503640413284, -0.08134657144546509, 0..."
3164120,21701,3,2003-12-29,0.947826,American Beauty,"[0.024971242994070053, -0.06603381037712097, 0..."


In [82]:
import numpy as np

In [89]:
# Work on the first 100,000 rows only.
merged_df_cut = merged_df.head(n=100_000)

In [83]:
def concatenate_features(row):
    """Build one flat feature vector for a row.

    Args:
        row: Mapping-like object (here a DataFrame row) providing a scalar
            under 'Year' and a 1-D sequence under 'embeddings'.

    Returns:
        1-D numpy array holding the 'Year' value followed by the
        'embeddings' values.
    """
    year_part = [row['Year']]
    embedding_part = row['embeddings']
    return np.concatenate((year_part, embedding_part))
# Attach the per-row feature vector: scaled Year followed by the title embedding.
merged_df['combined'] = merged_df.apply(concatenate_features, axis=1)

# Re-take the 100,000-row subset now that 'combined' exists. In top-to-bottom
# order the earlier `merged_df_cut` was sliced before this column was added,
# so it would lack 'combined' and the tensor-building cell would raise KeyError.
merged_df_cut = merged_df.head(100000)

In [90]:
# Inspect the subset; the tensor-building cell below reads its 'combined' column.
merged_df_cut

Unnamed: 0,UserID,Rating,Date,Year,Title,embeddings,combined
16820592,6,3,2005-12-04,0.869565,The Grifters,"[-0.01786697655916214, -0.013425135053694248, ...","[0.8695652173913043, -0.01786697655916214, -0...."
2547216,6,3,2005-11-25,0.982609,The Matrix: Revolutions,"[-0.04949893802404404, 0.019510140642523766, -...","[0.9826086956521739, -0.04949893802404404, 0.0..."
9273482,6,3,2005-03-18,0.678261,Rosemary's Baby,"[-0.06936520338058472, -0.09750981628894806, -...","[0.6782608695652174, -0.06936520338058472, -0...."
19879789,6,2,2004-09-27,0.721739,The Sting,"[-0.08671136200428009, 0.08106035739183426, -0...","[0.7217391304347827, -0.08671136200428009, 0.0..."
2461952,6,5,2004-09-15,0.991304,Kill Bill: Vol. 2,"[-0.03953291103243828, 0.06410317867994308, 0....","[0.991304347826087, -0.03953291103243828, 0.06..."
...,...,...,...,...,...,...,...
8315899,11043,2,2005-10-29,0.939130,Two Girls and a Guy,"[0.03250643610954285, -0.004106117878109217, -...","[0.9391304347826087, 0.03250643610954285, -0.0..."
15729179,11043,4,2003-10-15,0.939130,Shadow of Doubt,"[-0.07806366682052612, -0.03144596889615059, -...","[0.9391304347826087, -0.07806366682052612, -0...."
18445870,11043,4,2004-10-21,0.895652,Undercover Blues,"[-0.12121220678091049, -0.003152482910081744, ...","[0.8956521739130435, -0.12121220678091049, -0...."
22571793,11043,3,2004-11-18,0.895652,Double,"[0.0018072213279083371, 0.022918827831745148, ...","[0.8956521739130435, 0.0018072213279083371, 0...."


In [91]:
# One group per user; iterated below to fill the per-user tensors.
grouped = merged_df_cut.groupby(by='UserID')

In [92]:
import numpy as np

# Build fixed-size, zero-padded per-user arrays:
#   X[i, j, :] is the feature vector of user i's j-th kept rating,
#   Y[i, j]    is the corresponding rating.
# Users with fewer than n ratings leave the remaining positions at 0;
# users with more than n ratings are truncated to their first n rows.
T = len(grouped)  # number of users (groups)
# Alternative: size the second axis to the most active user instead of a cap.
#n = grouped.size().max()
n = 200  # maximum number of ratings kept per user
d = len(merged_df_cut['combined'].iloc[0])  # feature length, read from the first row

X = np.zeros((T, n, d))  # Shape: (T, n, d)
Y = np.zeros((T, n))     # Shape: (T, n)

for i, (user_id, user_data) in enumerate(grouped):
    # Truncate to n rows *before* stacking so the slice always agrees with the
    # allocated shape and no work is spent on rows that are discarded.
    user_rows = user_data.iloc[:n]
    features = np.stack(user_rows['combined'].values)  # Shape: (num_rows_user, d)
    labels = user_rows['Rating'].values                # Shape: (num_rows_user,)

    num_rows_user = len(features)

    # Insert the data into X and Y; rows past num_rows_user stay zero (padding).
    X[i, :num_rows_user, :] = features
    Y[i, :num_rows_user] = labels

In [93]:
# Check the feature tensor dimensions: (users, ratings per user, feature length).
# NOTE(review): the recorded output's second dimension (1062) does not match
# n = 200 set in the cell above — the output looks stale; re-run to refresh.
X.shape

(1961, 1062, 385)

In [94]:
# Check the label array dimensions: (users, ratings per user).
# NOTE(review): the recorded output's second dimension (1062) does not match
# n = 200 set when Y was allocated — the output looks stale; re-run to refresh.
Y.shape

(1961, 1062)

In [95]:
# Persist the padded feature and label arrays.
np.save(file='../data/X_100k.npy', arr=X)
np.save(file='../data/Y_100k.npy', arr=Y)