## Give Me The Next AAA Title
# Predictive Modeling
### Feature Engineering of Languages Spoken
****

<br>
by Dustin Reyes
<br>
<br>
Prepared for:
<br>
Mynt (Globe Fintech Innovations, Inc.)
<br>
<br>

In [1]:
import os
import ast
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
# Load the combined metadata and discard the stray 'Unnamed: 0' column
# (presumably an index saved by an earlier to_csv call) in one chained step.
movie_df = pd.read_csv('data2/combined_metadata.csv').drop(columns=['Unnamed: 0'])
movie_df.head()

Unnamed: 0,adult,budget,genres,id,original_language,production_countries,revenue,runtime,spoken_languages,movieId,imdbId,tmdbId,views,likes,dislikes,n_trailers
0,False,30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",862,en,"[{'iso_3166_1': 'US', 'name': 'United States o...",373554033.0,81.0,"[{'iso_639_1': 'en', 'name': 'English'}]",1,114709,862.0,8050136,6330,1965,3
1,False,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",8844,en,"[{'iso_3166_1': 'US', 'name': 'United States o...",262797249.0,104.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",2,113497,8844.0,83048,70,14,1
2,False,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",31357,en,"[{'iso_3166_1': 'US', 'name': 'United States o...",81452156.0,127.0,"[{'iso_639_1': 'en', 'name': 'English'}]",4,114885,31357.0,104948,0,0,1
3,False,60000000,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",949,en,"[{'iso_3166_1': 'US', 'name': 'United States o...",187436818.0,170.0,"[{'iso_639_1': 'en', 'name': 'English'}, {'iso...",6,113277,949.0,32419,119,1,1
4,False,35000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",9091,en,"[{'iso_3166_1': 'US', 'name': 'United States o...",64350171.0,106.0,"[{'iso_639_1': 'en', 'name': 'English'}]",9,114576,9091.0,6162,12,3,2


In [3]:
def _parse_literal(value):
    """Turn the string repr of a Python literal into the object it describes.

    Values that are already lists are returned unchanged, so this cell can be
    re-run after the columns have been parsed. Anything else is handed to
    ast.literal_eval, which raises on input it cannot parse.
    """
    return value if isinstance(value, list) else ast.literal_eval(value)


# These columns are read from the CSV as strings holding list-of-dict
# literals. Parse each one column-wise (Series.apply) instead of building a
# full row object per record with DataFrame.apply(axis=1).
for column in ('genres', 'production_countries', 'spoken_languages'):
    movie_df[column] = movie_df[column].apply(_parse_literal)

In [4]:
def english(row):
    """Flag whether English is among a movie's spoken languages.

    Parameters
    ----------
    row : object exposing a ``spoken_languages`` attribute
        The attribute is iterated as a sequence of dicts with a 'name' key.

    Returns
    -------
    int
        1 if any entry is named 'English', otherwise 0 (including when the
        list is empty or missing).
    """
    languages = row.spoken_languages
    if not languages:
        return 0
    return int(any(entry['name'] == 'English' for entry in languages))


def numlang(row):
    """Return how many spoken languages are listed for a movie row.

    Parameters
    ----------
    row : object exposing a ``spoken_languages`` attribute
        The attribute is expected to be a sized collection (e.g. a list).

    Returns
    -------
    int
        The number of entries, or 0 when the collection is empty or None.
        Returning 0 explicitly (rather than falling through to an implicit
        None) keeps the resulting column numeric and free of missing values.
    """
    languages = row.spoken_languages
    return len(languages) if languages else 0


def listlangs(row):
    """Print the name of each non-English spoken language of a movie row.

    Parameters
    ----------
    row : object exposing a ``spoken_languages`` attribute
        The attribute is iterated as a sequence of dicts with a 'name' key.

    Nothing is printed when the list is empty or missing; the function
    always returns None.
    """
    for entry in row.spoken_languages or ():
        name = entry['name']
        if name == 'English':
            continue
        print(name)

In [5]:
# Add a per-movie language count by applying numlang to every row.
movie_df['numlang'] = movie_df.apply(numlang, axis=1)

In [6]:
# One-hot encode the original language. The dtype is pinned so the indicator
# columns are 0/1 integers on every pandas version (pandas >= 2.0 would
# otherwise produce booleans, which are written to CSV as True/False).
one_hot = pd.get_dummies(movie_df['original_language'], dtype=int)

In [7]:
# Remove obscure languages to avoid memorization: keep an indicator column
# only if at least MIN_MOVIES_PER_LANGUAGE movies have that original language.
MIN_MOVIES_PER_LANGUAGE = 10
language_counts = one_hot.sum()
rare_languages = language_counts[language_counts < MIN_MOVIES_PER_LANGUAGE].index
# `columns=` replaces the positional axis argument, which pandas 2.0 rejects.
one_hot = one_hot.drop(columns=rare_languages)

In [8]:
# Attach the language indicator columns to the movie table (join aligns on the index).
movie_df = movie_df.join(one_hot)

In [9]:
# Drop the raw language columns in a single call. `columns=` replaces the
# positional axis argument, which pandas 2.0 rejects.
movie_df = movie_df.drop(columns=['original_language', 'spoken_languages'])

In [10]:
# Display the column labels of the transformed table.
movie_df.columns

Index(['adult', 'budget', 'genres', 'id', 'production_countries', 'revenue',
       'runtime', 'movieId', 'imdbId', 'tmdbId', 'views', 'likes', 'dislikes',
       'n_trailers', 'numlang', 'cn', 'da', 'de', 'en', 'es', 'fr', 'hi', 'it',
       'ja', 'ko', 'ml', 'ru', 'ta', 'zh'],
      dtype='object')

In [11]:
# Preview the first rows of the transformed table.
movie_df.head()

Unnamed: 0,adult,budget,genres,id,production_countries,revenue,runtime,movieId,imdbId,tmdbId,...,es,fr,hi,it,ja,ko,ml,ru,ta,zh
0,False,30000000,"[{'id': 16, 'name': 'Animation'}, {'id': 35, '...",862,"[{'iso_3166_1': 'US', 'name': 'United States o...",373554033.0,81.0,1,114709,862.0,...,0,0,0,0,0,0,0,0,0,0
1,False,65000000,"[{'id': 12, 'name': 'Adventure'}, {'id': 14, '...",8844,"[{'iso_3166_1': 'US', 'name': 'United States o...",262797249.0,104.0,2,113497,8844.0,...,0,0,0,0,0,0,0,0,0,0
2,False,16000000,"[{'id': 35, 'name': 'Comedy'}, {'id': 18, 'nam...",31357,"[{'iso_3166_1': 'US', 'name': 'United States o...",81452156.0,127.0,4,114885,31357.0,...,0,0,0,0,0,0,0,0,0,0
3,False,60000000,"[{'id': 28, 'name': 'Action'}, {'id': 80, 'nam...",949,"[{'iso_3166_1': 'US', 'name': 'United States o...",187436818.0,170.0,6,113277,949.0,...,0,0,0,0,0,0,0,0,0,0
4,False,35000000,"[{'id': 28, 'name': 'Action'}, {'id': 12, 'nam...",9091,"[{'iso_3166_1': 'US', 'name': 'United States o...",64350171.0,106.0,9,114576,9091.0,...,0,0,0,0,0,0,0,0,0,0


In [12]:
# Build IMDb title identifiers: 'tt' followed by the numeric id zero-padded
# to 7 digits. A fixed 'tt0' prefix is only right for 6-digit ids; padding
# also handles shorter ids and leaves ids of 7 or more digits unpadded.
# NOTE(review): assumes imdbId holds integers with leading zeros stripped and
# that consumers of this file expect standard IMDb ids -- confirm.
movie_df['imdbId'] = 'tt' + movie_df['imdbId'].astype(str).str.zfill(7)

In [14]:
# Persist the engineered table; the index is not written to the file.
movie_df.to_csv('data2/languages_parsed.csv', index=False)