In [40]:
import pandas as pd

#### Movies

In [4]:
# Load the movies table from the ml-20m data folder (path is relative to the working directory).
movies_df = pd.read_csv('ml-20m/movies.csv')

In [5]:
# movies_df provides the movieId, title, and genres of each movie.
# genres is a single pipe-delimited string (e.g. "Comedy|Romance"), so it will need
# to be split up into separate features for each individual movie.
movies_df.head()

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [6]:
# (rows, columns) of the movies table.
movies_df.shape

(27278, 3)

In [7]:
# Number of missing values in each column of the movies table.
movies_df.isna().sum()

movieId    0
title      0
genres     0
dtype: int64

In [8]:
# Column dtypes, non-null counts and memory footprint of the movies table.
movies_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 27278 entries, 0 to 27277
Data columns (total 3 columns):
movieId    27278 non-null int64
title      27278 non-null object
genres     27278 non-null object
dtypes: int64(1), object(2)
memory usage: 639.4+ KB


In [9]:
# Summary statistics of the numeric columns, transposed so each column is one row.
movies_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
movieId,27278.0,59855.48057,44429.314697,1.0,6931.25,68068.0,100293.25,131262.0


#### Ratings 

In [72]:
# Load the user ratings table from the ml-20m data folder.
# This is by far the largest file read here (info() below reports ~20 million rows).
ratings_df = pd.read_csv('ml-20m/ratings.csv')

In [73]:
# Preview the first rows: one rating per (userId, movieId) pair plus a timestamp.
ratings_df.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,2,3.5,1112486027
1,1,29,3.5,1112484676
2,1,32,3.5,1112484819
3,1,47,3.5,1112484727
4,1,50,3.5,1112484580


In [10]:
# Number of missing values in each column of the ratings table.
ratings_df.isna().sum()

userId       0
movieId      0
rating       0
timestamp    0
dtype: int64

In [11]:
# Column dtypes and memory footprint of the ratings table.
ratings_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 20000263 entries, 0 to 20000262
Data columns (total 4 columns):
userId       int64
movieId      int64
rating       float64
timestamp    int64
dtypes: float64(1), int64(3)
memory usage: 610.4 MB


In [12]:
# (rows, columns) of the ratings table.
ratings_df.shape

(20000263, 4)

In [13]:
# Drop the timestamp column before the ratings are summarised and pivoted.
# errors='ignore' keeps the cell re-runnable: without it, executing the cell a
# second time raises KeyError because the column is already gone.
ratings_df = ratings_df.drop(columns='timestamp', errors='ignore')

In [14]:
# Summary statistics of the numeric columns, transposed so each column is one row.
ratings_df.describe().transpose()

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
userId,20000263.0,69045.872583,40038.626653,1.0,34395.0,69141.0,103637.0,138493.0
movieId,20000263.0,9041.56733,19789.477445,1.0,902.0,2167.0,4770.0,131262.0
rating,20000263.0,3.525529,1.051989,0.5,3.0,3.5,4.0,5.0


#### Tags

In [10]:
# Load the free-text tags that users attached to movies.
tags_df = pd.read_csv('ml-20m/tags.csv')

In [11]:
# Preview the first rows: each row is one tag applied by a user to a movie.
tags_df.head()

Unnamed: 0,userId,movieId,tag,timestamp
0,18,4141,Mark Waters,1240597180
1,65,208,dark hero,1368150078
2,65,353,dark hero,1368150079
3,65,521,noir thriller,1368149983
4,65,592,dark hero,1368150078


In [12]:
# (rows, columns) of the tags table.
tags_df.shape

(465564, 4)

In [13]:
# Number of missing values in each column of the tags table.
tags_df.isna().sum()

userId        0
movieId       0
tag          16
timestamp     0
dtype: int64

In [14]:
# Show the rows whose tag text is missing.
tags_df.loc[tags_df['tag'].isna()]

Unnamed: 0,userId,movieId,tag,timestamp
373276,116460,123,,1199450867
373277,116460,346,,1199451946
373281,116460,1184,,1199452261
373288,116460,1785,,1199452006
373289,116460,2194,,1199450677
373291,116460,2691,,1199451002
373299,116460,4103,,1199451920
373301,116460,4473,,1199451040
373303,116460,4616,,1199452441
373319,116460,7624,,1199452266


In [15]:
# Replace the null tags with the placeholder "missing".
# Only the 'tag' column is filled and the result stays a DataFrame; assigning
# tags_df['tag'].fillna(...) back to tags_df would collapse the table to a single
# Series and lose the userId, movieId and timestamp columns.
tags_df = tags_df.fillna({'tag': 'missing'})

#### Links

In [16]:
# Load the table that links each movieId to external identifiers.
links_df = pd.read_csv('ml-20m/links.csv')

In [17]:
# links_df maps each movieId to its imdbId and tmdbId; these links between
# tables will be necessary for joins later.

# TODO: use imdbId to fetch actor / actress data
# (presumably via the imdbpy package — confirm).

links_df.head()

Unnamed: 0,movieId,imdbId,tmdbId
0,1,114709,862.0
1,2,113497,8844.0
2,3,113228,15602.0
3,4,114885,31357.0
4,5,113041,11862.0


In [18]:
# (rows, columns) of the links table.
links_df.shape

(27278, 3)

#### Genome Tags & Scores

In [19]:
# Load the genome scores: one relevance value per (movieId, tagId) pair.
genome_scores_df = pd.read_csv('ml-20m/genome-scores.csv')

In [20]:
# Preview the first rows of the genome scores.
genome_scores_df.head()

Unnamed: 0,movieId,tagId,relevance
0,1,1,0.025
1,1,2,0.025
2,1,3,0.05775
3,1,4,0.09675
4,1,5,0.14675


In [21]:
# (rows, columns) of the genome scores table.
genome_scores_df.shape

(11709768, 3)

In [22]:
# Number of missing values in each column of the genome scores table.
genome_scores_df.isna().sum()

movieId      0
tagId        0
relevance    0
dtype: int64

In [23]:
# Column dtypes and memory footprint of the genome scores table.
genome_scores_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 11709768 entries, 0 to 11709767
Data columns (total 3 columns):
movieId      int64
tagId        int64
relevance    float64
dtypes: float64(1), int64(2)
memory usage: 268.0 MB


In [24]:
# Load the genome tag lookup: the text label for each tagId.
genome_tags_df = pd.read_csv('ml-20m/genome-tags.csv')

In [25]:
# Preview the first rows of the genome tag lookup.
genome_tags_df.head()

Unnamed: 0,tagId,tag
0,1,007
1,2,007 (series)
2,3,18th century
3,4,1920s
4,5,1930s


In [26]:
# (rows, columns) of the genome tag lookup.
genome_tags_df.shape

(1128, 2)

In [27]:
# Number of missing values in each column of the genome tag lookup.
genome_tags_df.isna().sum()

tagId    0
tag      0
dtype: int64

In [28]:
# Column dtypes, non-null counts and memory footprint of the genome tag lookup.
genome_tags_df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1128 entries, 0 to 1127
Data columns (total 2 columns):
tagId    1128 non-null int64
tag      1128 non-null object
dtypes: int64(1), object(1)
memory usage: 17.7+ KB


In [29]:
# Attach the tag text to every (movieId, tagId, relevance) score by joining on
# the shared tagId key (inner join, tag columns first).
genome_df = genome_tags_df.merge(genome_scores_df, on='tagId')

In [30]:
# Each movie is now described in great detail: one row per (movie, tag) pair
# with the tag text and its relevance. Sort by movieId to preview a single movie.
genome_df.sort_values(by = 'movieId').head()

Unnamed: 0,tagId,tag,movieId,relevance
0,1,007,1,0.025
1183434,115,based on true story,1,0.2245
5875646,567,islam,1,0.054
10993479,1060,united nations,1,0.0095
8470896,817,prohibition,1,0.031


In [31]:
# Persist the merged genome table; index = False leaves the row index out of the file.
genome_df.to_csv('ml-20m/genome_clean.csv', index = False)

## Movie Cleanings

    Creating features from movies dataframe

In [32]:
# Re-inspect the movies table before deriving new features from it.
movies_df.head(5)

Unnamed: 0,movieId,title,genres
0,1,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy
1,2,Jumanji (1995),Adventure|Children|Fantasy
2,3,Grumpier Old Men (1995),Comedy|Romance
3,4,Waiting to Exhale (1995),Comedy|Drama|Romance
4,5,Father of the Bride Part II (1995),Comedy


In [33]:
# Take the text after the last "(" in each title and keep its first four
# characters, e.g. "Toy Story (1995)" -> "1995". Titles without a "(YYYY)" suffix
# yield non-numeric text here, so the value is kept as a string for now.
movies_df['year'] = movies_df['title'].map(lambda title: title.rpartition('(')[2][:4])

In [34]:
# Collect every extracted "year" value that cannot be parsed as an integer,
# i.e. the titles whose year is missing from the expected "(YYYY)" format.
change_needed = []
for year_text in movies_df['year']:
    try:
        int(year_text)
    except (TypeError, ValueError):
        # Only conversion failures are expected here; a bare `except:` would also
        # swallow KeyboardInterrupt and hide unrelated errors.
        change_needed.append(year_text)
change_needed

['Baby',
 'Das ',
 'Bici',
 'Braz',
 'Slay',
 'Tato',
 'Nati',
 'The ',
 'In O',
 'Step',
 'Two:',
 "Li'l",
 'A Ye',
 'Body',
 'Pols',
 'The ',
 'My O',
 'Movi',
 'Mich']

In [35]:
# Show the titles of the movies with missing years.
# A single isin() filter lists each affected row exactly once and renders as one
# table; printing inside a row-by-row loop repeats the rows whose token is shared
# by several titles (e.g. 'The ').
movies_df.loc[movies_df['year'].isin(change_needed), ['title', 'year']]

           title  year
10593  Babylon 5  Baby
                                         title  year
15646  Millions Game, The (Das Millionenspiel)  Das 
                                                  title  year
17341  Bicycle, Spoon, Apple (Bicicleta, cullera, poma)  Bici
                                       title  year
23617  Brazil: In the Shadow of the Stadiums  Braz
                    title  year
23824  Slaying the Badger  Slay
                            title  year
24286  Tatort: Im Schmerz geboren  Tato
                                     title  year
24412  National Theatre Live: Frankenstein  Nati
                                      title  year
26115  The Court-Martial of Jackie Robinson  The 
26963      The Third Reich: The Rise & Fall  The 
               title  year
26127  In Our Garden  In O
                                    title  year
26180  Stephen Fry In America - New World  Step
                                title  year
26335  Two: The Story of Roman & Nyr

In [36]:
# Fill in the release year for the titles that have no "(YYYY)" suffix, keyed by
# the four-character token that was extracted into the 'year' column.
#
# Only the 'year' cell of each matching row is written (.loc[mask, 'year']).
# Assigning to movies_df[mask] instead overwrites every column of the row, which
# replaces movieId, title and genres with the year as well.
year_by_token = {
    'Baby': 1993,
    'Das ': 1970,
    'Bici': 2010,
    'Braz': 2014,
    'Slay': 2014,
    'Tato': 2014,
    'Nati': 2011,
    'In O': 2002,
    'Step': 2008,
    'Two:': 2008,
    "Li'l": 2014,
    'A Ye': 1991,
    'Body': 1993,
    'Pols': 2014,
    'My O': 2014,
    'Movi': 2003,
    'Mich': 1993,
}
for token, year in year_by_token.items():
    movies_df.loc[movies_df['year'] == token, 'year'] = year

# Two different titles share the token 'The ', so they are told apart by title;
# matching on the token alone gives both rows the first value and makes the
# second assignment a no-op.
# NOTE(review): the years are carried over from the original assignments, in the
# order the titles were listed — please confirm them.
year_by_the_title = {
    'The Court-Martial of Jackie Robinson': 1990,
    'The Third Reich: The Rise & Fall': 1993,
}
for title_prefix, year in year_by_the_title.items():
    is_match = (movies_df['year'] == 'The ') & movies_df['title'].str.startswith(title_prefix, na=False)
    movies_df.loc[is_match, 'year'] = year

In [37]:
# Convert the year column from text to integers now that the missing years are filled in.
# astype(int) raises ValueError if any non-numeric year string is left.
movies_df['year'] = movies_df['year'].astype(int)

In [39]:
# Persist the movies table with its new year column; index = False leaves the row index out of the file.
movies_df.to_csv('ml-20m/movies_cleaned.csv', index = False)

    Create a dataframe describing genome_df

## User Reviews Based Recommendations

User-based: look for movies that other, similar users have watched and that you have not yet seen ("based on your profile, you will like this"). The matrix has users as rows and movies as columns.


Item-based: find items that were rated similarly by the same users ("people who bought this also bought this"). The recommendation shown when you open a movie's or book's page is normally item-based. The matrix has movies as rows and users as columns.

movies rows and users as columns

In [242]:
# Preview the ratings table that the pivots below are built from.
ratings_df.head()

Unnamed: 0,userId,movieId,rating
0,1,2,3.5
1,1,29,3.5
2,1,32,3.5
3,1,47,3.5
4,1,50,3.5


In [None]:
# User-based view: one row per userId, one column per movieId, cells hold the
# rating (pivot_table averages duplicate pairs; unrated pairs become NaN).
# NOTE(review): this is a dense users x movies matrix and may exceed available
# memory on the full ratings table — confirm before running.
user_based_df = ratings_df.pivot_table(index='userId', columns='movieId', values='rating')

In [None]:
# Item-based view: one row per movieId, one column per userId, cells hold the
# rating (pivot_table averages duplicate pairs; unrated pairs become NaN).
# NOTE(review): this is a dense movies x users matrix and may exceed available
# memory on the full ratings table — confirm before running.
movie_based_df = ratings_df.pivot_table(index='movieId', columns='userId', values='rating')