In [2]:
import pandas as pd

### Read in and explore the the ratings data

In [3]:
# Reading in the rating file
with open('ml-10M100K/ratings.dat','r') as f:
    next(f) # skip first row
    df = pd.DataFrame(l.rstrip().split('::') for l in f)

In [4]:
df.head(3)

Unnamed: 0,0,1,2,3
0,1,185,5,838983525
1,1,231,5,838983392
2,1,292,5,838983421


In [5]:
# Dropping the time stamp 
df = df.drop(columns = 3)

In [6]:
df.columns

Int64Index([0, 1, 2], dtype='int64')

In [7]:
# renaming columns
df = df.rename({0:'user_id', 1:'movie_id', 2:'rating'}, axis = 1)

In [8]:
df.head(3)

Unnamed: 0,user_id,movie_id,rating
0,1,185,5
1,1,231,5
2,1,292,5


In [9]:
# Checking for nulls 
df.isnull().sum()

user_id     0
movie_id    0
rating      0
dtype: int64

In [10]:
# Checking the size of our dataset
df.shape

(10000053, 3)

In [11]:
# checking datatypes
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000053 entries, 0 to 10000052
Data columns (total 3 columns):
user_id     object
movie_id    object
rating      object
dtypes: object(3)
memory usage: 228.9+ MB


In [12]:
# Changing rating to float and user_id/movie_id to an integer 
df['rating'] = df['rating'].astype(float)
df['user_id'] = df['user_id'].astype(int)
df['movie_id'] = df['movie_id'].astype(int)

In [13]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 10000053 entries, 0 to 10000052
Data columns (total 3 columns):
user_id     int64
movie_id    int64
rating      float64
dtypes: float64(1), int64(2)
memory usage: 228.9 MB


In [14]:
# Checking the distribution of ratings
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
user_id,10000053.0,35869.862996,20585.335259,1.0,18123.0,35741.0,53608.0,71567.0
movie_id,10000053.0,4120.291876,8938.402475,1.0,648.0,1834.0,3624.0,65133.0
rating,10000053.0,3.512422,1.060418,0.5,3.0,4.0,4.0,5.0


    Due to the size of the file we are going to drop half the users 

In [15]:
df = df[df['user_id']%3==0]

In [16]:
df.shape

(3348206, 3)

### Read the movies data and join it with our rating data

In [17]:
movies_df = pd.read_csv('ml-10M100K/movies.csv')

In [18]:
movies_df.head(3)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance


In [19]:
movies_df.shape

(27278, 3)

In [20]:
# drop the genre since I will not be using that columns
movies_df = movies_df.drop(columns='genres')

In [21]:
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27278 entries, 0 to 27277
Data columns (total 2 columns):
movieId    27278 non-null int64
title      27278 non-null object
dtypes: int64(1), object(1)
memory usage: 426.3+ KB


In [22]:
# Changing the data type of movie id to int for a join with the ratings dataset
movies_df['movieId'] = movies_df['movieId'].astype(int)

### Joining the rating and movies

In [23]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 3348206 entries, 41 to 9999950
Data columns (total 3 columns):
user_id     int64
movie_id    int64
rating      float64
dtypes: float64(1), int64(2)
memory usage: 102.2 MB


In [24]:
# Left joining the movies to the the rating data
final_df = pd.merge(df, movies_df, 
                    left_on = 'movie_id',
                    right_on = 'movieId',
                    how = 'left')

In [25]:
final_df.head()

Unnamed: 0,user_id,movie_id,rating,movieId,title
0,3,110,4.5,110.0,Braveheart (1995)
1,3,151,4.5,151.0,Rob Roy (1995)
2,3,213,5.0,213.0,Burnt by the Sun (Utomlyonnye solntsem) (1994)
3,3,590,3.5,590.0,Dances with Wolves (1990)
4,3,1148,4.0,1148.0,Wallace & Gromit: The Wrong Trousers (1993)


In [26]:
# Dropping the movie_ids as they are no longer necessary
final_df = final_df.drop(columns=['movie_id','movieId'])

In [27]:
# double checking final shape
print(df.shape)
print(final_df.shape)

(3348206, 3)
(3348206, 3)


In [28]:
# Checking for null values
final_df.isnull().sum() 

user_id     0
rating      0
title      13
dtype: int64

In [29]:
# I am going to drop the N/As since it's a small percentage of our dataset
final_df = final_df.dropna()

In [30]:
final_df.head(3)

Unnamed: 0,user_id,rating,title
0,3,4.5,Braveheart (1995)
1,3,4.5,Rob Roy (1995)
2,3,5.0,Burnt by the Sun (Utomlyonnye solntsem) (1994)


## User Reviews Based Recommendations

user based -- looking for movies that other users have watched that you have not yet: users as rows and movies as columns: based on your profile you will like this


item based -- item with similar user movies rows and users as columns: when you go to a movie or book and the remmendation there is normally item based. people who bought this also bought this

In [102]:
item_based_df = pd.pivot_table(final_df, values = 'rating' , index = 'title', columns = 'user_id')

MemoryError: 

In [97]:
item_based_df.head()

user_id,3,6,9,12,18,24,27,30,33,36,...,71535,71538,71541,71544,71547,71550,71553,71559,71562,71565
title,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
'Round Midnight (1986),,,,,,,,,,,...,,,,,,,,,,
'Til There Was You (1997),,,,,,,,,,,...,,,,,,,,,,
"'burbs, The (1989)",,,,,2.5,,,,,,...,,,,,,,2.5,,,
'night Mother (1986),,,,,,,,,,,...,,,,,,,,,,
*batteries not included (1987),,,,,,,,,,,...,,,,,,,,,,


    Exporting the data for other notebooks

In [98]:
item_based_df.to_csv('data/item_based.csv')

In [31]:
user_based_df = pd.pivot_table(final_df, values = 'rating' , index = 'user_id', columns = 'title')
user_based_df.head(3)

title,'Round Midnight (1986),'Til There Was You (1997),"'burbs, The (1989)",'night Mother (1986),*batteries not included (1987),"...All the Marbles (California Dolls, The) (1981)",...And God Spoke (1993),...And Justice for All (1979),1-900 (06) (1994),10 (1979),...,"Zorro, the Gay Blade (1981)",Zulu (1964),Zus & Zo (2001),[REC] (2007),eXistenZ (1999),loudQUIETloud: A Film About the Pixies (2006),xXx (2002),xXx: State of the Union (2005),¡Three Amigos! (1986),À nous la liberté (Freedom for Us) (1931)
user_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
3,,,,,,,,,,,...,,,,,,,,,,
6,,,,,,,,,,,...,,,,,,,,,,
9,,,,,,,,,,,...,,,,,,,,,,


In [32]:
user_based_df.to_csv('data/movie_based.csv')