# Import

In [1]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import surprise
from surprise.reader import Reader
from surprise.dataset import Dataset
from sklearn.metrics.pairwise import cosine_similarity
import scipy.sparse as sp
from tqdm import tqdm
import io
import csv

# Read Movie Titles

In [2]:
# Parse the movie catalogue into a list of records, one dict per movie.
# Each record's first field is stored as movie_id, the second as
# release_year, and everything after that as movie_name.
data = []
with open('data/movie_titles.csv', 'r', encoding='ISO-8859-1') as file:
    lines = csv.reader(file, delimiter=',')
    for line in tqdm(lines):
        data.append({
            'movie_id': line[0],
            'release_year': line[1],
            # A title that itself contains commas is split into several fields
            # by csv.reader; re-join the pieces with ',' (not ' ') so the
            # stored title keeps its commas.
            'movie_name': ','.join(line[2:]),
        })

## Movies data frame to store titles.

movies = pd.DataFrame(data)
movies.head()

17770it [00:00, 158718.84it/s]


Unnamed: 0,movie_id,release_year,movie_name
0,1,2003,Dinosaur Planet
1,2,2004,Isle of Man TT 2004 Review
2,3,1997,Character
3,4,1994,Paula Abdul's Get Up & Dance
4,5,2004,The Rise and Fall of ECW


In [3]:
# movie_id was read as text; cast it to int so it matches the integer
# movie_id parsed from the ratings files and used as the merge key later.
movies['movie_id'] = movies['movie_id'].astype(int)
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 17770 entries, 0 to 17769
Data columns (total 3 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   movie_id      17770 non-null  int64 
 1   release_year  17770 non-null  object
 2   movie_name    17770 non-null  object
dtypes: int64(1), object(2)
memory usage: 416.6+ KB


# Read Rating Data

## Combined Data 1

In [4]:
# Parse the first ratings shard into `ratings`.
# Lines ending in ':' are headers carrying a movie id; every other line is
# split on ',' into user_id, rating and date and is attributed to the most
# recent header above it. The three fields are kept as strings here.
movieid = None  # no movie header seen yet in this file
data = []
with open('data/combined_data_1.txt', 'r') as file:
    # Iterate over the file handle rather than file.readlines(): the raw
    # lines are then never all held in memory next to the parsed records,
    # and no global list of raw lines outlives the cell.
    for line in tqdm(file):
        line = line.rstrip('\n')
        if not line:
            # Ignore blank lines (e.g. a trailing empty line).
            continue
        if line.endswith(':'):
            movieid = int(line[:-1])
            continue
        if movieid is None:
            raise ValueError(f'Rating line found before any movie header: {line!r}')

        fields = line.split(',')
        data.append({
            'movie_id': movieid,
            'user_id': fields[0],
            'rating': fields[1],
            'date': fields[2],
        })

ratings = pd.DataFrame(data)

100%|██████████| 24058263/24058263 [00:47<00:00, 511561.25it/s]


In [5]:
# Distinct movies and users present in `ratings` at this point.
movie_count = ratings['movie_id'].unique().size
user_count = ratings['user_id'].unique().size
print(f'Total Movies in Dataset {movie_count}')
print(f'Total Users in Dataset {user_count}')

Total Movies in Dataset 4499
Total Users in Dataset 470758


## Combined Data 2

In [6]:
# Parse the second ratings shard and add its rows to `ratings`.
# Lines ending in ':' are headers carrying a movie id; every other line is
# split on ',' into user_id, rating and date and is attributed to the most
# recent header above it. The three fields are kept as strings here.
# NOTE: this cell extends `ratings`, so running it twice adds the shard twice.
movieid = None  # reset so an id left over from an earlier cell is never reused
data = []
with open('data/combined_data_2.txt', 'r') as file:
    # Iterate over the file handle rather than file.readlines(): the raw
    # lines are then never all held in memory next to the parsed records.
    for line in tqdm(file):
        line = line.rstrip('\n')
        if not line:
            # Ignore blank lines (e.g. a trailing empty line).
            continue
        if line.endswith(':'):
            movieid = int(line[:-1])
            continue
        if movieid is None:
            raise ValueError(f'Rating line found before any movie header: {line!r}')

        fields = line.split(',')
        data.append({
            'movie_id': movieid,
            'user_id': fields[0],
            'rating': fields[1],
            'date': fields[2],
        })

# DataFrame.append is deprecated and was removed in pandas 2.0;
# pd.concat produces the same row-wise result.
ratings = pd.concat([ratings, pd.DataFrame(data)], ignore_index=True)

100%|██████████| 26982302/26982302 [01:00<00:00, 444785.43it/s]


In [7]:
# Distinct movies and users present in `ratings` at this point.
movie_count = ratings['movie_id'].unique().size
user_count = ratings['user_id'].unique().size
print(f'Total Movies in Dataset {movie_count}')
print(f'Total Users in Dataset {user_count}')

Total Movies in Dataset 9210
Total Users in Dataset 478018


## Combined Data 3

In [8]:
# Parse the third ratings shard and add its rows to `ratings`.
# Lines ending in ':' are headers carrying a movie id; every other line is
# split on ',' into user_id, rating and date and is attributed to the most
# recent header above it. The three fields are kept as strings here.
# NOTE: this cell extends `ratings`, so running it twice adds the shard twice.
movieid = None  # reset so an id left over from an earlier cell is never reused
data = []
with open('data/combined_data_3.txt', 'r') as file:
    # Iterate over the file handle rather than file.readlines(): the raw
    # lines are then never all held in memory next to the parsed records.
    for line in tqdm(file):
        line = line.rstrip('\n')
        if not line:
            # Ignore blank lines (e.g. a trailing empty line).
            continue
        if line.endswith(':'):
            movieid = int(line[:-1])
            continue
        if movieid is None:
            raise ValueError(f'Rating line found before any movie header: {line!r}')

        fields = line.split(',')
        data.append({
            'movie_id': movieid,
            'user_id': fields[0],
            'rating': fields[1],
            'date': fields[2],
        })

# DataFrame.append is deprecated and was removed in pandas 2.0;
# pd.concat produces the same row-wise result.
ratings = pd.concat([ratings, pd.DataFrame(data)], ignore_index=True)

100%|██████████| 22605786/22605786 [03:42<00:00, 101824.62it/s]


In [9]:
# Distinct movies and users present in `ratings` at this point.
movie_count = ratings['movie_id'].unique().size
user_count = ratings['user_id'].unique().size
print(f'Total Movies in Dataset {movie_count}')
print(f'Total Users in Dataset {user_count}')

Total Movies in Dataset 13367
Total Users in Dataset 479453


## Combined Data 4

In [10]:
# Parse the fourth ratings shard and add its rows to `ratings`.
# Lines ending in ':' are headers carrying a movie id; every other line is
# split on ',' into user_id, rating and date and is attributed to the most
# recent header above it. The three fields are kept as strings here.
# NOTE: this cell extends `ratings`, so running it twice adds the shard twice.
movieid = None  # reset so an id left over from an earlier cell is never reused
data = []  # kept as a list of dicts: a later cell displays data[:10]
with open('data/combined_data_4.txt', 'r') as file:
    # Iterate over the file handle rather than file.readlines(): the raw
    # lines are then never all held in memory next to the parsed records.
    for line in tqdm(file):
        line = line.rstrip('\n')
        if not line:
            # Ignore blank lines (e.g. a trailing empty line).
            continue
        if line.endswith(':'):
            movieid = int(line[:-1])
            continue
        if movieid is None:
            raise ValueError(f'Rating line found before any movie header: {line!r}')

        fields = line.split(',')
        data.append({
            'movie_id': movieid,
            'user_id': fields[0],
            'rating': fields[1],
            'date': fields[2],
        })

# DataFrame.append is deprecated and was removed in pandas 2.0;
# pd.concat produces the same row-wise result.
ratings = pd.concat([ratings, pd.DataFrame(data)], ignore_index=True)

100%|██████████| 26851926/26851926 [03:25<00:00, 130903.82it/s]


In [11]:
# Distinct movies and users present in `ratings` at this point.
movie_count = ratings['movie_id'].unique().size
user_count = ratings['user_id'].unique().size
print(f'Total Movies in Dataset {movie_count}')
print(f'Total Users in Dataset {user_count}')

Total Movies in Dataset 17770
Total Users in Dataset 480189


In [12]:
# (rows, columns) of the combined ratings frame.
ratings_shape = ratings.shape
print(f'Shape of the ratings data: {ratings_shape}')

Shape of the ratings data: (100480507, 4)


In [13]:
# Summarise column dtypes and memory usage of the combined ratings frame
# before any type conversion is applied.
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 100480507 entries, 0 to 100480506
Data columns (total 4 columns):
 #   Column    Dtype 
---  ------    ----- 
 0   movie_id  int64 
 1   user_id   object
 2   rating    object
 3   date      object
dtypes: int64(1), object(3)
memory usage: 3.0+ GB


In [14]:
# Attach the movie columns (release_year, movie_name) to every rating row.
# The inner join keeps only ratings whose movie_id also appears in `movies`.
# NOTE: `ratings` is rebound here, so re-running this cell merges the movie
# columns onto the already-merged frame a second time.
ratings = pd.merge(ratings, movies, how='inner', on='movie_id')
ratings.head()

Unnamed: 0,movie_id,user_id,rating,date,release_year,movie_name
0,1,1488844,3,2005-09-06,2003,Dinosaur Planet
1,1,822109,5,2005-05-13,2003,Dinosaur Planet
2,1,885013,4,2005-10-19,2003,Dinosaur Planet
3,1,30878,4,2005-12-26,2003,Dinosaur Planet
4,1,823519,3,2004-05-03,2003,Dinosaur Planet


In [15]:
# Re-check dtypes and memory usage now that the movie columns have been
# merged onto the ratings frame.
ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100480507 entries, 0 to 100480506
Data columns (total 6 columns):
 #   Column        Dtype 
---  ------        ----- 
 0   movie_id      int64 
 1   user_id       object
 2   rating        object
 3   date          object
 4   release_year  object
 5   movie_name    object
dtypes: int64(1), object(5)
memory usage: 5.2+ GB


In [None]:
# ratings['user_id'] = ratings['user_id'].astype(int)
# ratings['rating'] = ratings['rating'].astype(int)
# ratings['date'] = pd.to_datetime(ratings["date"])
# ratings['release_year'] = ratings['release_year'].astype(int)

In [18]:
# Convert the text columns parsed from the ratings files to proper dtypes and
# derive two calendar features from the rating date.
ratings['user_id'] = ratings.user_id.astype(int)
ratings['rating'] = ratings.rating.astype(int)

rated_on = pd.to_datetime(ratings['date'])
ratings['date'] = rated_on
# release_year is left as-is: the int cast for it is disabled in this notebook
# (presumably some values are not numeric -- TODO confirm).
ratings['rating_year'] = rated_on.dt.year
ratings['rating_day'] = rated_on.dt.strftime('%A')  # full weekday name

In [19]:
# Confirm the dtype conversions and the two derived columns
# (rating_year, rating_day) on the ratings frame.
ratings.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 100480507 entries, 0 to 100480506
Data columns (total 8 columns):
 #   Column        Dtype         
---  ------        -----         
 0   movie_id      int64         
 1   user_id       int64         
 2   rating        int64         
 3   date          datetime64[ns]
 4   release_year  object        
 5   movie_name    object        
 6   rating_year   int64         
 7   rating_day    object        
dtypes: datetime64[ns](1), int64(4), object(3)
memory usage: 6.7+ GB


In [20]:
# Write the first 20 rows of `ratings` to a small CSV sample, without the
# index column.
test_data = ratings.iloc[:20]
test_data.to_csv('data/test_data.csv', index=False)

In [21]:
# Persist the full ratings frame (with the merged movie columns and derived
# date columns) to CSV; index=False omits the row index from the file.
ratings.to_csv('data/ratings.csv', index=False)

# Create Sparse Matrix

In [22]:
# Build a CSR matrix whose rows are indexed by user_id, whose columns are
# indexed by movie_id and whose stored values are the ratings. No explicit
# shape is given, so the matrix dimensions are derived from the index arrays.
rating_triplets = (ratings['rating'], (ratings['user_id'], ratings['movie_id']))
sparse_data = sp.csr_matrix(rating_triplets)

# Save the matrix to disk so it can be reloaded without re-parsing the data.
sp.save_npz('data/sparse_data.npz', sparse_data)
print(f'The shape of Sparse Matrix is: {sparse_data.shape}')

The shape of Sparse Matrix is: (2649430, 17771)


In [24]:
# Debugging peek at the first 10 parsed records. `data` is whatever list the
# most recently executed parsing cell left behind, so this output depends on
# cell execution order.
data[:10]

[{'movie_id': 13368,
  'user_id': '2385003',
  'rating': '4',
  'date': '2004-07-08'},
 {'movie_id': 13368, 'user_id': '659432', 'rating': '3', 'date': '2005-03-16'},
 {'movie_id': 13368, 'user_id': '751812', 'rating': '2', 'date': '2002-12-16'},
 {'movie_id': 13368,
  'user_id': '2625420',
  'rating': '2',
  'date': '2004-05-25'},
 {'movie_id': 13368,
  'user_id': '1650301',
  'rating': '1',
  'date': '2005-08-30'},
 {'movie_id': 13368,
  'user_id': '2269227',
  'rating': '4',
  'date': '2005-10-27'},
 {'movie_id': 13368,
  'user_id': '2220672',
  'rating': '4',
  'date': '2002-08-19'},
 {'movie_id': 13368,
  'user_id': '2500511',
  'rating': '4',
  'date': '2003-08-11'},
 {'movie_id': 13368,
  'user_id': '1452058',
  'rating': '2',
  'date': '2005-01-29'},
 {'movie_id': 13368,
  'user_id': '1624891',
  'rating': '3',
  'date': '2002-07-27'}]