# Movie Predictor
### This project is dedicated to the legendary action hero Steven Seagal.
![title](steven.jpg)
## What does it do?
### Predict
- Average rating of movies on IMDb
### Using
- Directors
- Writers
- Actors
- Cinematographers
- Composers

### Extract the data

In [1]:
import pandas as pd

cols = ["tconst", "averageRating"]
titleAndRating = pd.read_csv('../title.ratings.tsv', delimiter='\t', usecols=cols)

cols = ["tconst", "titleType", "primaryTitle"]
titleTypeAndName = pd.read_csv('../title.basics.tsv', delimiter='\t', usecols=cols)

cols = ["tconst", "nconst", "category"]
movieRoles = pd.read_csv('../title.principles.tsv', delimiter='\t', usecols=cols)

cols = ["nconst", "primaryName"]
names = pd.read_csv('../name.basics.tsv', delimiter='\t', usecols=cols)

### Clean the data so we only have
- Movies
- TV Movies
- Shorts

### And we only need the
- Directors
- Writers
- Actors
- Cinematographers
- Composers

In [2]:

titleTypeAndName = titleTypeAndName.loc[(titleTypeAndName["titleType"] == "short") | (titleTypeAndName["titleType"] == "tvMovie") | (titleTypeAndName["titleType"] == "movie") ]
titleTypeAndName.reset_index(inplace=True,drop=True)
movieRoles = movieRoles.loc[(movieRoles["category"] == "self") | (movieRoles["category"] == "actor") | (movieRoles["category"] == "actress") | (movieRoles["category"] == "director") | (movieRoles["category"] == "writer") | (movieRoles["category"] == "cinematographer") | (movieRoles["category"] == "composer")]
movieRoles.reset_index(inplace=True,drop=True) #Need to reset the index

### Assign the names for convinience

In [3]:
cols = ["nconst"]
movieRoles = movieRoles.join(names.set_index(cols), on = cols)
movieRoles

Unnamed: 0,tconst,nconst,category,primaryName
0,tt0000001,nm1588970,self,Carmencita
1,tt0000001,nm0005690,director,William K.L. Dickson
2,tt0000001,nm0374658,cinematographer,William Heise
3,tt0000002,nm0721526,director,Émile Reynaud
4,tt0000002,nm1335271,composer,Gaston Paulin
...,...,...,...,...
42219420,tt9916880,nm0286175,actor,Wayne Forester
42219421,tt9916880,nm10535738,actress,Eden Gamliel
42219422,tt9916880,nm0996406,director,Hilary Audus
42219423,tt9916880,nm1482639,writer,Lucinda Whiteley


### Separate the roles

In [15]:
actors = movieRoles[  (movieRoles['category'] == "actor") | (movieRoles['category'] == "actress") | (movieRoles['category'] == "self")]
actors.reset_index(inplace=True,drop=True)
actors.drop(["nconst", "category"], axis=1, inplace=True)
actors.rename(columns = {"primaryName":"actors"}, inplace=True)
actors["actors"] = actors["actors"].str.lower()
actors["actors"] = actors["actors"].str.replace(' ', '')

composers = movieRoles[  (movieRoles['category'] == "composer")]
composers.reset_index(inplace=True,drop=True)
composers.drop(["nconst", "category"], axis = 1, inplace=True)
composers.rename(columns = {"primaryName":"composers"}, inplace=True)
composers["composers"] = composers["composers"].str.lower()
composers["composers"] = composers["composers"].str.replace(' ', '')

directors = movieRoles[  (movieRoles['category'] == "director")]
directors.reset_index(inplace=True,drop=True)
directors.drop(["nconst", "category"], axis=1, inplace=True)
directors.rename(columns = {"primaryName":"directors"}, inplace=True)
directors["directors"] = directors["directors"].str.lower()
directors["directors"] = directors["directors"].str.replace(' ', '')

cinematographers = movieRoles[  (movieRoles['category'] == "cinematographer")]
cinematographers.reset_index(inplace=True,drop=True)
cinematographers.drop(["nconst", "category"], axis=1, inplace=True)
cinematographers.rename(columns = {"primaryName":"cinematographers"}, inplace=True)
cinematographers["cinematographers"] = cinematographers["cinematographers"].str.lower()
cinematographers["cinematographers"] = cinematographers["cinematographers"].str.replace(' ', '')

writers = movieRoles[  (movieRoles['category'] == "writer")]
writers.reset_index(inplace=True,drop=True)
writers.drop(["nconst", "category"], axis=1, inplace=True)
writers.rename(columns = {"primaryName":"writers"}, inplace=True)
writers["writers"] = writers["writers"].str.lower()
writers["writers"] = writers["writers"].str.replace(' ', '')


A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().drop(
A value is trying to be set on a copy of a slice from a DataFrame

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  return super().rename(
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  actors["actors"] = actors["actors"].str.lower()
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.

Unnamed: 0,tconst,writers
0,tt0000036,washingtonirving
1,tt0000076,washingtonirving
2,tt0000108,washingtonirving
3,tt0000109,washingtonirving
4,tt0000110,washingtonirving
...,...,...
6380410,tt9916852,birolgüven
6380411,tt9916852,aysengünsuteker
6380412,tt9916856,johanplanefeldt
6380413,tt9916880,lucindawhiteley


### Make the new dataset
- titleType, primaryTitle, Directors, Writers, Actors, Cinematographers, Composers, rating

In [10]:
movieData = pd.DataFrame(columns = ['tconst'])
#movieData = pd.DataFrame(columns = ['tconst','titleType', 'primaryTitle', 'director', 'writer', 'actor', 'cinematographer', 'composer', 'averageRating'])

movieData['tconst'] = titleTypeAndName['tconst'].copy()

cols = ["tconst"]
movieData = movieData.join(titleTypeAndName.set_index(cols), on = cols)
movieData = movieData.join(titleAndRating.set_index(cols), on = cols)
movieData = movieData.dropna( how='all', subset=['averageRating'])
movieData.reset_index(inplace=True,drop=True)
movieData


Unnamed: 0,tconst,titleType,primaryTitle,averageRating
0,tt0000001,short,Carmencita,5.7
1,tt0000002,short,Le clown et ses chiens,6.0
2,tt0000003,short,Pauvre Pierrot,6.5
3,tt0000004,short,Un bon bock,6.0
4,tt0000005,short,Blacksmith Scene,6.2
...,...,...,...,...
455850,tt9916428,movie,The Secret of China,3.8
455851,tt9916460,tvMovie,Pink Taxi,9.3
455852,tt9916538,movie,Kuambil Lagi Hatiku,8.3
455853,tt9916544,short,My Sweet Prince,7.1


### Let's add some professions

In [11]:
movieDataTemp = movieData.join(directors.set_index(cols), on = cols)
movieDataTemp = movieDataTemp.join(writers.set_index(cols), on = cols)
movieDataTemp = movieDataTemp.join(actors.set_index(cols), on = cols)
movieDataTemp = movieDataTemp.join(composers.set_index(cols), on = cols)
movieDataTemp = movieDataTemp.join(cinematographers.set_index(cols), on = cols)

#output = movieDataTemp.groupby(['tconst']).agg(lambda x: set(x))
output = movieDataTemp.groupby('tconst', as_index=False).agg(lambda x: ', '.join(set(x.astype(str))))
output

#output['titleType'] = output['titleType'].apply(lambda x: ', '.join(set([y.strip() for y in x.split(',')])))


output.to_csv("../movieData2.tsv", sep="\t")

### Clean the data so there's no NaN-values

In [12]:
import numpy as np
cleanedData = output.replace('nan', np.nan)
#cleanedData = cleanedData.dropna()
#cleanedData = cleanedData.reset_index(drop=True)
cleanedData.drop('tconst', axis=1, inplace=True) #Don't need tconst any longer
cleanedData.drop('primaryTitle', axis=1, inplace=True) #Don't need tconst any longer

cleanedData['averageRating'] = pd.to_numeric(cleanedData['averageRating'], errors='coerce')
cleanedData.to_csv("../movieDB.tsv", sep="\t")

### Make training and testing set

In [1]:
from sklearn.model_selection import train_test_split

cleanedData = pd.read_csv('../movieDB.tsv')

x_train, x_test = train_test_split(cleanedData, test_size=0.2)

#Split dependant and indemepndant variable
y_train = pd.DataFrame(columns = ['averageRating'])
x_train.drop('averageRating', axis=1, inplace=True)

y_test = pd.DataFrame(columns = ['averageRating'])
x_test.drop('averageRating', axis=1, inplace=True)

NameError: name 'pd' is not defined

### Do some learning

In [14]:
from sklearn import linear_model
from sklearn.feature_extraction import DictVectorizer

vectorizer = DictVectorizer()
vector_data = vectorizer.fit_transform(x_train)

regr = linear_model.LinearRegression()
#regr.fit(x_train, y_train)

AttributeError: 'str' object has no attribute 'items'