In [None]:
import numpy as np 
import pandas as pd 
import os
import json
import seaborn as sns 
import matplotlib.pyplot as plt
%matplotlib inline
plt.style.use('ggplot')
import lightgbm as lgb
import time
from wordcloud import WordCloud
import ast
import collections
from collections import Counter
import itertools
from sklearn.model_selection import train_test_split
from sklearn.decomposition import TruncatedSVD
from sklearn.preprocessing import Normalizer
from sklearn.pipeline import make_pipeline
import warnings
warnings.filterwarnings("ignore")

# Data overview & Feature Engineering

In [None]:
# Show which files are available in the input directory before loading anything.
print(os.listdir("../input"))

In [None]:
# Load the two CSV files into DataFrames; every later cell mutates these two
# objects in place, so re-run this cell to get back to the raw data.
train = pd.read_csv('../input/train.csv')
test = pd.read_csv('../input/test.csv')

In [None]:
# Preview the first three rows of the training frame.
train.head(3)

In [None]:
# Shape of the training frame once every row containing a missing value is removed
# (the result is only displayed; `train` itself is not modified).
train.dropna().shape

In [None]:
# Column dtypes and non-null counts, used to spot the columns with many missing values.
train.info()

We begin by dropping the `belongs_to_collection` and `homepage` features because they contain too many `NaN` values.

In [None]:
# Remove the two mostly-empty columns from both datasets.
train = train.drop(columns=['belongs_to_collection', 'homepage'])
test = test.drop(columns=['belongs_to_collection', 'homepage'])

### Useless Features
We drop some of the features that are not useful (at first glance)
* `imdb_id` : if we stick to the data that is provided, we don't need this id. Perhaps we could add some new external data with it later...
* `poster_path` : a link to the poster picture (no need for now, if we want to use some ensemble techniques)

In [None]:
# Remove the identifier and poster-link columns from both datasets.
train = train.drop(columns=['imdb_id', 'poster_path'])
test = test.drop(columns=['imdb_id', 'poster_path'])

### Let's look at numbers first!
The quantitative features that could be helpful are:
* the **budget**
* the **popularity**
* the **runtime**
* and the target : **revenue**

In [None]:
# Replace the NaN values of 'runtime' in each dataset by the mean runtime of
# that dataset's other movies.
# The result of `fillna` is assigned back to the column instead of using chained
# indexing (`df.runtime[mask] = ...`): the chained form may write to a temporary
# copy (SettingWithCopyWarning) and does nothing under pandas Copy-on-Write,
# which would silently leave the NaN values in place.
train['runtime'] = train['runtime'].fillna(train['runtime'].mean())
test['runtime'] = test['runtime'].fillna(test['runtime'].mean())

In [None]:
# Pairwise scatter plots of the numeric features and the target, restricted to
# rows where all four values are present. The trailing semicolon keeps the cell
# from echoing the returned grid object.
f = ['budget', 'popularity', 'runtime', 'revenue']
sns.pairplot(train.loc[:, f].dropna());

At first look, the budget and the revenue seem correlated!

### Language

#### Number of spoken languages

The spoken languages are stored as a list of dictionaries serialized as a string; let's simplify it.

In [None]:
print("raw format:", train.spoken_languages.iloc[0])

# Each raw cell is the string form of a list of dicts. Keep only the first value
# of every dict; cells that are not strings (e.g. NaN) become an empty list.
for frame in (train, test):
    frame['spoken_languages'] = frame['spoken_languages'].apply(
        lambda raw: [list(entry.values())[0] for entry in ast.literal_eval(raw)]
        if isinstance(raw, str) else []
    )

train.spoken_languages.head()

Now we can create 2 additional features: the number of spoken languages, and whether English is one of them.

In [None]:
# Two derived features per film: how many spoken languages are listed, and
# whether the code 'en' is among them.
for frame in (train, test):
    frame['nb_spoken_languages'] = frame['spoken_languages'].map(len)
    frame['english_spoken'] = frame['spoken_languages'].map(lambda langs: 'en' in langs)

In [None]:
# How many training films list 0, 1, 2, ... spoken languages.
train.nb_spoken_languages.value_counts()

There is one film in which 9 languages are spoken! Or maybe it is the number of languages into which the film has been translated...

#### Original Language
Let's see what the main original languages are across both the train and test data:

In [None]:
# Count original languages over train and test together, then display only the
# languages that occur more than 20 times.
combined_languages = pd.concat([train['original_language'], test['original_language']], axis=0)
all_lang = combined_languages.value_counts()
all_lang.loc[all_lang > 20]

In [None]:
# Main languages: those that occur more than 20 times over train + test.
main_languages = all_lang[all_lang > 20].index.tolist()

# Integer code per main language (starting at 1); code 0 is reserved for 'other'.
dict_language = {lang: code for code, lang in enumerate(main_languages, start=1)}
dict_language['other'] = 0

for frame in (train, test):
    # Collapse every language outside the main ones into the 'other' category...
    frame['original_language'] = frame['original_language'].apply(
        lambda lang: lang if lang in main_languages else 'other'
    )
    # ...then store the integer code as a new feature.
    frame['language'] = frame['original_language'].apply(lambda lang: dict_language[lang])

### Movie genre

Let's look at the different genres associated with the movies.

In [None]:
# Same string preprocessing as before, but keeping the SECOND value of every
# dict; cells that are not strings become an empty list.
for frame in (train, test):
    frame['genres'] = frame['genres'].apply(
        lambda raw: [list(entry.values())[1] for entry in ast.literal_eval(raw)]
        if isinstance(raw, str) else []
    )

train['genres'].head()

Here is the distribution of the number of genres per movie. There are 3 films with 7 genres, that's a lot!

In [None]:
# Number of training films for each genre count, ordered by genre count.
train.genres.apply(len).value_counts().sort_index()

In [None]:
# Print the title and genre list of every training film that has exactly 7 genres.
seven_genre_films = train.loc[train.genres.apply(len) == 7, ['title', 'genres']]
for title, film_genres in seven_genre_films.values:
    print('film:', title, '\ngenres:', *film_genres, '\n')

Let's regroup all the existing genres :

In [None]:
# Occurrence count of every genre over train and test together.
all_genre_lists = pd.concat((train.genres, test.genres), axis=0)
genres = Counter(name for film_genres in all_genre_lists.values for name in film_genres)

print("Number of different movie genres:", len(genres))
print()
frequency_lines = ['{} : {}'.format(name, count) for name, count in genres.items()]
print("Genre frequency:\n" + '\n'.join(frequency_lines))

As the number of training samples is limited (3000), I don't want to add too many features.

Thus, instead of creating 20 categorical features (one for each genre), let's project those categories into a smaller space using **SVD**.

In [None]:
%%time
temp_train = train[['id', 'genres']]
temp_test = test[['id', 'genres']]
for g in genres:
    temp_train[g] = temp_train.genres.apply(lambda x: 1 if g in x else 0)
    temp_test[g] = temp_test.genres.apply(lambda x: 1 if g in x else 0)
    
X_train = temp_train.drop(['genres', 'id'], axis=1).values
X_test = temp_test.drop(['genres', 'id'], axis=1).values

# Number of features we want for genres
n_comp_genres = 3

# Build the SVD pipeline
svd = make_pipeline(
    TruncatedSVD(n_components=n_comp_genres),
    Normalizer(norm='l2', copy=False)
)

# Here are our new features
f_train = svd.fit_transform(X_train)
f_test = svd.transform(X_test)

In [None]:
# Store each SVD component of the genres as its own column in both datasets.
for i in range(n_comp_genres):
    column = 'genres_reduced_{}'.format(i)
    train[column] = f_train[:, i]
    test[column] = f_test[:, i]

### Other multicategorical variables

The same reasoning is applicable to the other multicategorical variables :
* `production_companies`
* `production_countries`
* `Keywords` and `crew` but those are very sparse, as the number of total keywords or crew members can grow very fast.

In [None]:
# Apply the same preprocessing on the string values: each raw cell is the string
# form of a list of dicts, from which a single value per dict is kept. The
# position of that value differs per column; non-string cells become [].
value_position = (
    ('production_companies', 0),
    ('production_countries', 0),
    ('Keywords', 1),
)
for column, position in value_position:
    for frame in (train, test):
        frame[column] = frame[column].apply(
            lambda raw: [list(entry.values())[position] for entry in ast.literal_eval(raw)]
            if isinstance(raw, str) else []
        )

In [None]:
# Production companies: occurrence count over train and test together
company_lists = pd.concat((train.production_companies, test.production_companies), axis=0)
production_companies = Counter(name for names in company_lists.values for name in names)
print("Number of different production companies:", len(production_companies))

# Production countries: occurrence count over train and test together
country_lists = pd.concat((train.production_countries, test.production_countries), axis=0)
production_countries = Counter(name for names in country_lists.values for name in names)
print("Number of different production countries:", len(production_countries))

In [None]:
# Add, for each multi-valued column, the number of entries per film as a feature.
count_features = (
    ('nb_production_companies', 'production_companies'),
    ('nb_production_countries', 'production_countries'),
    ('nb_keywords', 'Keywords'),
)
for feature_name, source_column in count_features:
    for frame in (train, test):
        frame[feature_name] = frame[source_column].apply(len)

In [None]:
%%time
print('Applying SVD on production companies to create reduced features')

# Factorizing all the little production companies into an 'other' variable
big_companies = [p for p in production_companies if production_companies[p] > 30]
train.production_companies = train.production_companies.apply(lambda l: list(map(lambda x: x if x in big_companies else 'other', l)))

temp_train = train[['id', 'production_companies']]
temp_test = test[['id', 'production_companies']]

for p in big_companies + ['other']:
    temp_train[p] = temp_train.production_companies.apply(lambda x: 1 if p in x else 0)
    temp_test[p] = temp_test.production_companies.apply(lambda x: 1 if p in x else 0)
    
X_train = temp_train.drop(['production_companies', 'id'], axis=1).values
X_test = temp_test.drop(['production_companies', 'id'], axis=1).values

# Number of features we want for genres
n_comp_production_companies = 3

# Build the SVD pipeline
svd = make_pipeline(
    TruncatedSVD(n_components=n_comp_production_companies),
    Normalizer(norm='l2', copy=False)
)

# Here are our new features
f_train = svd.fit_transform(X_train)
f_test = svd.transform(X_test)

for i in range(n_comp_production_companies):
    train['production_companies_reduced_{}'.format(i)] = f_train[:, i]
    test['production_companies_reduced_{}'.format(i)] = f_test[:, i]

In [None]:
%%time
print('Applying SVD on production countries to create reduced features')

temp_train = train[['id', 'production_countries']]
temp_test = test[['id', 'production_countries']]
for p in production_countries:
    temp_train[p] = temp_train.production_countries.apply(lambda x: 1 if p in x else 0)
    temp_test[p] = temp_test.production_countries.apply(lambda x: 1 if p in x else 0)
    
X_train = temp_train.drop(['production_countries', 'id'], axis=1).values
X_test = temp_test.drop(['production_countries', 'id'], axis=1).values

# Number of features we want for genres
n_comp_production_countries = 3

# Build the SVD pipeline
svd = make_pipeline(
    TruncatedSVD(n_components=n_comp_production_countries),
    Normalizer(norm='l2', copy=False)
)

# Here are our new features
f_train = svd.fit_transform(X_train)
f_test = svd.transform(X_test)

for i in range(n_comp_production_countries):
    train['production_countries_reduced_{}'.format(i)] = f_train[:, i]
    test['production_countries_reduced_{}'.format(i)] = f_test[:, i]

# Models

In [None]:
# Columns fed to the model: raw numeric columns, simple derived counts/flags,
# and the SVD components of each multi-valued categorical column.
features = ['budget', 
            'popularity', 
            'runtime', 
            'nb_spoken_languages', 
            'english_spoken', 
            'language',
            'nb_production_companies',
            'nb_production_countries',
            'nb_keywords'
           ]
features += ['production_companies_reduced_{}'.format(i) for i in range(n_comp_production_companies)]
# The country components have their own column prefix. Using the
# 'production_companies_reduced_' prefix here would list the company columns
# twice and leave the country components out of the model altogether.
features += ['production_countries_reduced_{}'.format(i) for i in range(n_comp_production_countries)]
features += ['genres_reduced_{}'.format(i) for i in range(n_comp_genres)]

In [None]:
# Model inputs and target; the model is trained on the base-10 log of revenue.
X = train.loc[:, features]
y = np.log10(train['revenue'])

# Hold out 20% of the training rows (shuffled, fixed seed) for validation.
X_train, X_test, y_train, y_test = train_test_split(
    X, y,
    test_size=0.2,
    shuffle=True,
    random_state=42,
)

In [None]:
# LightGBM training parameters: RMSE-evaluated regression with a fixed seed.
params = {
    "objective": "regression",
    "num_leaves": 40,
    "min_data_in_leaf": 20,
    "max_depth": 6,
    "learning_rate": 0.001,
    "metric": "rmse",
    "random_state": 42,
    "lambda_l2": 0.005,
    "verbosity": -1,
}

In [None]:
# Wrap the training and validation splits as LightGBM datasets; the validation
# set references the training set.
lgb_train = lgb.Dataset(data=X_train, label=y_train)
lgb_eval = lgb.Dataset(data=X_test, label=y_test, reference=lgb_train)

In [None]:
print('Starting training...')
# Train with early stopping on the validation set.
# Early stopping is requested through the `lgb.early_stopping` callback rather
# than the `early_stopping_rounds=` keyword: LightGBM 4.x removed that keyword
# from `lgb.train` (passing it raises a TypeError), while the callback is
# accepted by both older and newer releases.
gbm = lgb.train(params,
                lgb_train,
                num_boost_round=5000,
                valid_sets=[lgb_eval],
                callbacks=[lgb.early_stopping(stopping_rounds=5)])

In [None]:
# Predict on the real test set. The model outputs log10(revenue), so the
# prediction is mapped back with a power of 10 before writing the submission.
# Note: this rebinds `X_test`, which previously held the validation split.
X_test = test[features]
log_revenue_pred = gbm.predict(X_test)
y_pred = 10 ** log_revenue_pred

submission = pd.DataFrame({'id': test.id, 'revenue': y_pred})
submission.to_csv('submission.csv', index=False)