# Movie Predictor
### This project is dedicated to the legendary action hero Steven Seagal.
![title](steven.jpg)
## What does it do?
### Predict
- Average rating of movies on IMDb
### Using
- Directors
- Writers
- Actors
- Cinematographers
- Composers

### Extract the data

In [6]:
import pandas as pd

def _read_tsv(path, wanted):
    """Read the tab-separated file at `path`, keeping only the columns in `wanted`."""
    return pd.read_csv(path, sep='\t', usecols=wanted)


# Rating per title.
titleAndRating = _read_tsv('../title.ratings.tsv', ["tconst", "averageRating"])

# Title type and display name per title.
titleTypeAndName = _read_tsv('../title.basics.tsv', ["tconst", "titleType", "primaryTitle"])

# One row per (title, person, role category).
# NOTE(review): IMDb publishes this dataset as "title.principals.tsv"; the path
# below is kept exactly as the notebook had it -- confirm it matches the file on disk.
movieRoles = _read_tsv('../title.principles.tsv', ["tconst", "nconst", "category"])

# Display name per person id.
names = _read_tsv('../name.basics.tsv', ["nconst", "primaryName"])

### Clean the data so we only have
- Movies
- TV Movies
- Shorts

### And we only need the
- Directors
- Writers
- Actors (including actresses and people credited as themselves)
- Cinematographers
- Composers

In [7]:

# Keep only the title types this project covers.
wantedTitleTypes = ["short", "tvMovie", "movie"]
isWantedTitle = titleTypeAndName["titleType"].isin(wantedTitleTypes)
titleTypeAndName = titleTypeAndName.loc[isWantedTitle].reset_index(drop=True)

# Keep only the credit categories used later on. Filtering leaves gaps in the
# index, so it is renumbered from 0 in both frames.
wantedCategories = ["self", "actor", "actress", "director", "writer", "cinematographer", "composer"]
isWantedRole = movieRoles["category"].isin(wantedCategories)
movieRoles = movieRoles.loc[isWantedRole].reset_index(drop=True)

### Assign the names for convenience

In [9]:
cols = ["nconst"]
# Attach each person's primaryName to their role rows, matching on nconst.
# A primaryName column left over from an earlier run of this cell is dropped
# first: joining again with that column present would raise on the overlapping
# column name, so without this the cell could only be run once per kernel.
movieRoles = (
    movieRoles
    .drop(columns=["primaryName"], errors="ignore")
    .join(names.set_index(cols), on=cols)
)
movieRoles

Unnamed: 0,tconst,nconst,category,primaryName
0,tt0000001,nm1588970,self,Carmencita
1,tt0000001,nm0005690,director,William K.L. Dickson
2,tt0000001,nm0374658,cinematographer,William Heise
3,tt0000002,nm0721526,director,Émile Reynaud
4,tt0000002,nm1335271,composer,Gaston Paulin
...,...,...,...,...
42219420,tt9916880,nm0286175,actor,Wayne Forester
42219421,tt9916880,nm10535738,actress,Eden Gamliel
42219422,tt9916880,nm0996406,director,Hilary Audus
42219423,tt9916880,nm1482639,writer,Lucinda Whiteley


### Separate the roles

In [25]:
def _rows_with_category(roles, categories):
    """Return the rows of `roles` whose 'category' is one of `categories`.

    The result is a new frame with its index renumbered from 0; `roles`
    itself is not modified.
    """
    return roles[roles['category'].isin(categories)].reset_index(drop=True)


# One frame per role. Actors, actresses and "self" credits are grouped together.
actors = _rows_with_category(movieRoles, ["actor", "actress", "self"])
composers = _rows_with_category(movieRoles, ["composer"])
directors = _rows_with_category(movieRoles, ["director"])
cinematographers = _rows_with_category(movieRoles, ["cinematographer"])
writers = _rows_with_category(movieRoles, ["writer"])


### Make the new dataset
- titleType, primaryTitle, Directors, Writers, Actors, Cinematographers, Composers, rating

In [10]:
cols = ["tconst"]

# Build the base table: every kept title with its rating, one row per title.
# titleTypeAndName already holds tconst, titleType and primaryTitle, so it is
# used directly as the left side instead of being re-joined onto a copy of its
# own tconst column (assumes tconst is unique per title -- TODO confirm).
# Titles with no rating get NaN from the left join and are dropped, then the
# index is renumbered from 0.
movieData = (
    titleTypeAndName
    .join(titleAndRating.set_index(cols), on=cols)
    .dropna(subset=['averageRating'])
    .reset_index(drop=True)
)
movieData


Unnamed: 0,tconst,titleType,primaryTitle,averageRating
0,tt0000001,short,Carmencita,5.7
1,tt0000002,short,Le clown et ses chiens,6.0
2,tt0000003,short,Pauvre Pierrot,6.5
3,tt0000004,short,Un bon bock,6.0
4,tt0000005,short,Blacksmith Scene,6.2
...,...,...,...,...
455850,tt9916428,movie,The Secret of China,3.8
455851,tt9916460,tvMovie,Pink Taxi,9.3
455852,tt9916538,movie,Kuambil Lagi Hatiku,8.3
455853,tt9916544,short,My Sweet Prince,7.1
