#### Authors: Tanmay Sawaji, Vishwas Desai

This file connverts director, actor and genre to binary attributes.
For eg: A movie can have 'action' and 'drama' in the genre attribute, so two new attributes will be created named 'action' and 'drama'

In [1]:
import numpy as np
import pandas as pd
from sklearn.preprocessing import StandardScaler


In [2]:
df = pd.read_csv("normalized_data.csv")
df.head()

Unnamed: 0.1,Unnamed: 0,movieId,imdbId,"('rating', 'Average Rating')",title,genres,budget,box_office,director,actors,positive_rate
0,0,1,114709,3.92093,Toy Story (1995),Adventure|Animation|Children|Comedy|Fantasy,0.001,1.339881e-07,John Lasseter,Tom Hanks;Tim Allen;Don Rickles;Jim Varney;Wal...,0.958333
1,1,2,113497,3.431818,Jumanji (1995),Adventure|Children|Fantasy,0.001666,9.426131e-08,Joe Johnston,Robin Williams;Jonathan Hyde;Kirsten Dunst;Bra...,1.0
2,2,4,114885,2.357143,Waiting to Exhale (1995),Comedy|Drama|Romance,0.000533,2.921562e-08,Forest Whitaker,Whitney Houston;Angela Bassett;Loretta Devine;...,0.2
3,3,6,113277,3.946078,Heat (1995),Action|Crime|Thriller,0.002,4.304214e-08,Michael Mann,Al Pacino;Robert De Niro;Val Kilmer;Jon Voight...,0.76
4,4,7,114319,3.185185,Sabrina (1995),Comedy|Romance,0.001933,3.131809e-08,Sydney Pollack,Harrison Ford;Julia Ormond;Greg Kinnear;Nancy ...,0.88


In [3]:
df.drop('Unnamed: 0', axis = 1, inplace = True)
df.drop('title', axis = 1, inplace = True)
df.drop('movieId', axis = 1, inplace = True)
df.drop('imdbId', axis = 1, inplace = True)
df.head()

Unnamed: 0,"('rating', 'Average Rating')",genres,budget,box_office,director,actors,positive_rate
0,3.92093,Adventure|Animation|Children|Comedy|Fantasy,0.001,1.339881e-07,John Lasseter,Tom Hanks;Tim Allen;Don Rickles;Jim Varney;Wal...,0.958333
1,3.431818,Adventure|Children|Fantasy,0.001666,9.426131e-08,Joe Johnston,Robin Williams;Jonathan Hyde;Kirsten Dunst;Bra...,1.0
2,2.357143,Comedy|Drama|Romance,0.000533,2.921562e-08,Forest Whitaker,Whitney Houston;Angela Bassett;Loretta Devine;...,0.2
3,3.946078,Action|Crime|Thriller,0.002,4.304214e-08,Michael Mann,Al Pacino;Robert De Niro;Val Kilmer;Jon Voight...,0.76
4,3.185185,Comedy|Romance,0.001933,3.131809e-08,Sydney Pollack,Harrison Ford;Julia Ormond;Greg Kinnear;Nancy ...,0.88


In [4]:
unique_genres = []
unique_directors = []
unique_actors = []
for i in range(len(df)):
    genres = df.at[i, 'genres']
    for genre in genres.split('|'):
        if genre not in unique_genres:
            unique_genres.append(genre)
    directors = df.at[i, 'director']
    for director in directors.split(';'):
        if director not in unique_directors:
            unique_directors.append(director)
    actors = df.at[i, 'actors']
    try:
        actors = actors.split(';')[:3]
    except:
        actors = actors.split(';')
    new_actors = ""
    for actor in actors:
        new_actors += actor + ";"
        if actor not in unique_actors:
            unique_actors.append(actor)
    new_actors = new_actors.rstrip(";")
    df.at[i, 'actors'] = new_actors
print(len(unique_genres))
print(len(unique_directors))
print(len(unique_actors))

20
1353
3759


In [5]:
total_values = len(df)
for genre in unique_genres:
    df[genre] = [0] * total_values
for director in unique_directors:
    df[director] = [0] * total_values
for actor in unique_actors:
    df[actor] = [0] * total_values
    
for i in range(len(df)):
    genres = df.at[i, 'genres']
    for genre in genres.split("|"):
        df.at[i, genre] = 1
    directors = df.at[i, 'director']
    for director in directors.split(';'):
        df.at[i, director] = 1
    for actor in actors:
        df.at[i, actor] = 1
df.head()

Unnamed: 0,"('rating', 'Average Rating')",genres,budget,box_office,director,actors,positive_rate,Adventure,Animation,Children,...,John Boyega,Cailee Spaeny,Rafe Spall,Sarah Vowell,Morena Baccarin,Joonas Suotamo,Lil Rel Howery,John David Washington,Isiah Whitlock Jr.,Amandla Stenberg
0,3.92093,Adventure|Animation|Children|Comedy|Fantasy,0.001,1.339881e-07,John Lasseter,Tom Hanks;Tim Allen;Don Rickles,0.958333,1,1,1,...,0,0,0,0,0,0,0,0,0,1
1,3.431818,Adventure|Children|Fantasy,0.001666,9.426131e-08,Joe Johnston,Robin Williams;Jonathan Hyde;Kirsten Dunst,1.0,1,0,1,...,0,0,0,0,0,0,0,0,0,1
2,2.357143,Comedy|Drama|Romance,0.000533,2.921562e-08,Forest Whitaker,Whitney Houston;Angela Bassett;Loretta Devine,0.2,0,0,0,...,0,0,0,0,0,0,0,0,0,1
3,3.946078,Action|Crime|Thriller,0.002,4.304214e-08,Michael Mann,Al Pacino;Robert De Niro;Val Kilmer,0.76,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,3.185185,Comedy|Romance,0.001933,3.131809e-08,Sydney Pollack,Harrison Ford;Julia Ormond;Greg Kinnear,0.88,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [6]:
df.drop('genres', axis = 1, inplace = True)
df.drop('director', axis = 1, inplace = True)
df.drop('actors', axis = 1, inplace = True)
df.head()

Unnamed: 0,"('rating', 'Average Rating')",budget,box_office,positive_rate,Adventure,Animation,Children,Comedy,Fantasy,Drama,...,John Boyega,Cailee Spaeny,Rafe Spall,Sarah Vowell,Morena Baccarin,Joonas Suotamo,Lil Rel Howery,John David Washington,Isiah Whitlock Jr.,Amandla Stenberg
0,3.92093,0.001,1.339881e-07,0.958333,1,1,1,1,1,0,...,0,0,0,0,0,0,0,0,0,1
1,3.431818,0.001666,9.426131e-08,1.0,1,0,1,0,1,0,...,0,0,0,0,0,0,0,0,0,1
2,2.357143,0.000533,2.921562e-08,0.2,0,0,0,1,0,1,...,0,0,0,0,0,0,0,0,0,1
3,3.946078,0.002,4.304214e-08,0.76,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
4,3.185185,0.001933,3.131809e-08,0.88,0,0,0,1,0,0,...,0,0,0,0,0,0,0,0,0,1


In [7]:
df.to_csv("trainset.csv", index = False)