In [None]:
import ast
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns

# show which files are available in the input directory
print(os.listdir('../input'))

In [None]:
# load the training set (`df`) and the test set (`test_df`) from the input directory
df = pd.read_csv('../input/train.csv')
test_df = pd.read_csv('../input/test.csv')

In [None]:
# first rows of the raw training data
df.head()

In [None]:
# summary statistics of the numeric columns in the raw training data
df.describe()

# Some feature engineering

In [None]:
# remove useless features
# drop() returns a new frame and errors='ignore' skips columns that are already
# gone, so this cell can be re-run without raising KeyError
useless_features = ['imdb_id', 'poster_path']
df = df.drop(columns=useless_features, errors='ignore')
test_df = test_df.drop(columns=useless_features, errors='ignore')

In [None]:
# a function for encoding NaN's to zeros and other values to ones of a selected feature
def binary_encode(feature):
    """Overwrite column `feature` in both `df` and `test_df` with 0/1 presence flags.

    A missing value (NaN/None) becomes 0, any other value becomes 1.

    Note: the column is replaced in place, so calling this twice on the same
    column turns every value into 1 (the flags written by the first call are
    no longer missing). Run it once per column on freshly loaded data.
    """
    for frame in (df, test_df):
        # vectorized missing-value check instead of a per-element Python lambda
        frame[feature] = frame[feature].notna().astype('int64')

In [None]:
# turn homepage and belongs_to_collection into 0/1 presence flags
for column in ('homepage', 'belongs_to_collection'):
    binary_encode(column)

In [None]:
# add a 0/1 feature telling whether the original language is English
def english_identifier(lang):
    """Return 1 when `lang` equals the language code 'en', otherwise 0."""
    return int(lang == 'en')


for frame in (df, test_df):
    frame['english_original'] = frame['original_language'].apply(english_identifier)

In [None]:
# count the entries of a list-valued column and make the count a new feature
def count_num(feature):
    """Add a `num_<feature>` column to both `df` and `test_df`.

    Each cell of `feature` is expected to be either missing or a string
    holding a Python literal (such as a list). A missing cell counts as 0;
    otherwise the count is the length of the parsed literal.

    The strings are parsed with `ast.literal_eval` rather than `eval`, so
    cell content is never executed as code. A cell that is not a plain
    literal raises ValueError or SyntaxError.
    """
    name = 'num_' + feature

    def counter(x):
        return 0 if pd.isna(x) else len(ast.literal_eval(x))

    for frame in (df, test_df):
        frame[name] = frame[feature].apply(counter)

In [None]:
# count number of genres, spoken_languages, production_companies, production_countries, cast, crew, Keywords
# and create new features from them
# (kept under its own name so it does not collide with the `features` frame used for modelling)
list_features = ['genres', 'spoken_languages', 'production_companies', 'production_countries', 'cast', 'crew', 'Keywords']
for feature in list_features:
    count_num(feature)

In [None]:
# 'runtime' has a few nulls; fill them with the mean of the TRAINING data in
# both frames so that train and test go through identical preprocessing
runtime_mean = df['runtime'].mean()
df['runtime'] = df['runtime'].fillna(runtime_mean)
test_df['runtime'] = test_df['runtime'].fillna(runtime_mean)

In [None]:
# first rows of the training data after feature engineering
df.head()

In [None]:
# summary statistics after feature engineering (now includes the new num_* and binary columns)
df.describe()

# Plotting

In [None]:
# Compare revenue of movies that belong to a collection (1) with movies that do not (0)
sns.catplot(data=df, x='belongs_to_collection', y='revenue')

From the plot we can see that some movies that belong to a collection manage to make a bigger revenue.
Probably because they have a bigger fan base.

In [None]:
# Compare revenue of movies that have a homepage (1) with movies that have none (0)
sns.catplot(data=df, x='homepage', y='revenue')

The plot tells us that movies with a homepage tend to make a much bigger revenue than movies without one.

In [None]:
# Revenue for movies whose original language is English (1) vs any other language (0)
sns.catplot(data=df, x='english_original', y='revenue')

Movies whose original language is English tend to make much higher revenues. Also, most of the movies in this dataset have English as their original language.

In [None]:
# Revenue by number of keywords attached to a movie
sns.catplot(data=df, x='num_Keywords', y='revenue')

This graph tells us that movies that have between 5 and about 20 keywords also have the highest revenue values.

In [None]:
# Revenue by crew size
sns.catplot(data=df, x='num_crew', y='revenue')

It seems that the crew size isn't really impacting the revenue of a movie.

In [None]:
# Revenue by cast size
sns.catplot(data=df, x='num_cast', y='revenue')

Same goes for the cast size.

In [None]:
# Revenue by number of production companies involved
sns.catplot(data=df, x='num_production_companies', y='revenue')

From the graph we can see that the highest revenue belongs to the movies that have 1-5 production companies involved.
Also it seems like there is one **OUTLIER** where a movie with 10 production companies has got a very high revenue.

In [None]:
# Revenue by number of production countries involved
sns.catplot(data=df, x='num_production_countries', y='revenue')

Movies with the highest revenue have only 1 or 2 countries involved in production.

In [None]:
# Revenue by number of spoken languages
sns.catplot(data=df, x='num_spoken_languages', y='revenue')

Most of the movies have only 1 or 2 spoken languages.
The highest revenues belong to movies that have one spoken language, presumably English.

In [None]:
# Revenue by number of genres assigned to a movie
sns.catplot(data=df, x='num_genres', y='revenue')

Interestingly, the highest revenue is made by movies that have a combination of 3 genres.

In [None]:
# columns whose pairwise correlations are shown in the heatmap
features_of_interest = [
    'revenue', 'belongs_to_collection', 'budget', 'popularity', 'runtime',
    'english_original', 'num_genres', 'num_spoken_languages',
    'num_production_companies', 'num_production_countries', 'num_Keywords',
]

# correlation matrix of the selected columns, drawn with matching tick labels on both axes
correlation_matrix = df[features_of_interest].corr()
sns.heatmap(
    correlation_matrix,
    xticklabels=features_of_interest,
    yticklabels=features_of_interest,
)

Budget and popularity have the highest correlation with revenue.

In [None]:
# revenue against runtime
plt.scatter(x=df['runtime'], y=df['revenue'])

This graph shows us that movies with a runtime from about 80 to about 180 minutes have the highest revenues.
Not too long and not too short I guess.

In [None]:
# revenue against popularity
plt.scatter(x=df['popularity'], y=df['revenue'])

It is not quite clear what popularity represents here, but the graph seems to show that most movies have a low popularity rating, while really popular movies tend to have higher than average revenues.

In [None]:
# revenue against budget
plt.scatter(x=df['budget'], y=df['revenue'])

Finally, the most correlated variable - **budget**.
It seems that we have a linear relationship between budget and revenue: in most cases, the higher the budget, the higher the revenue.

# Try out different base regressors

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.tree import DecisionTreeRegressor
from sklearn.svm import SVR
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor, AdaBoostRegressor

from sklearn.model_selection import cross_val_score

In [None]:
def score_model(model):
    """Print the mean cross-validated RMSE of `model`.

    The model is evaluated with 10-fold cross-validation on the module-level
    `features` / `labels`, which must be defined before this is called.
    """
    neg_mse_per_fold = cross_val_score(
        model,
        features,
        labels,
        scoring="neg_mean_squared_error",
        cv=10,
    )
    # the scorer yields negated MSE values: flip the sign, then take the root per fold
    rmse_per_fold = np.sqrt(np.negative(neg_mse_per_fold))
    print(rmse_per_fold.mean())

In [None]:
# columns used as model inputs
prediction_features = [
    'belongs_to_collection',
    'budget',
    'popularity',
    'runtime',
    'english_original',
    'num_genres',
    'num_spoken_languages',
    'num_production_companies',
    'num_production_countries',
    'num_Keywords',
]

# target and input matrix used by score_model and the grid searches
labels = df['revenue']
features = df[prediction_features]

In [None]:
# baseline: ordinary least-squares linear regression
score_model(LinearRegression())

In [None]:
# single decision tree; seeded so the printed score is the same on every run
score_model(DecisionTreeRegressor(random_state=42))

In [None]:
# support vector regression with gamma='auto'
# NOTE(review): the features are passed in unscaled; consider adding a scaler before SVR
score_model(SVR(gamma='auto'))

In [None]:
# small random forest (10 trees); seeded so the printed score is the same on every run
score_model(RandomForestRegressor(n_estimators=10, random_state=42))

In [None]:
# gradient boosting with default settings; seeded like the tuned model further down
score_model(GradientBoostingRegressor(random_state=42))

In [None]:
# AdaBoost with default settings; seeded so the printed score is the same on every run
score_model(AdaBoostRegressor(random_state=42))

In [None]:
from xgboost import XGBRegressor

# XGBoost regressor with default settings
xgb_baseline = XGBRegressor()
score_model(xgb_baseline)

**The XGBRegressor and GradientBoostingRegressor produced the lowest error!**

In [None]:
from sklearn.model_selection import GridSearchCV

# Every list holds a single value, so the "search" evaluates exactly one
# gradient-boosting configuration with 10-fold cross-validation.
gbr_param_grid = {
    'learning_rate': [0.01],
    'n_estimators': [500],
    'max_features': ['sqrt'],
    'min_samples_split': [9],
    'subsample': [0.7],
}

gbr_estimator = GradientBoostingRegressor(random_state=42)
cv = GridSearchCV(
    gbr_estimator,
    gbr_param_grid,
    cv=10,
    scoring="neg_mean_squared_error",
)
cv.fit(features, labels)

In [None]:
# RMSE-style score and parameters of the gradient-boosting grid search.
# The search was scored with "neg_mean_squared_error", hence the sign flip before the root.
# NOTE(review): this takes the root of the aggregated score, while score_model averages
# per-fold roots, so the two numbers are not exactly comparable.
np.sqrt(-cv.best_score_), cv.best_params_

In [None]:
import eli5
from eli5.sklearn import PermutationImportance

# Permutation importance of every input column for the fitted gradient-boosting search.
# random_state pins the shuffles so the importance table is the same on every run.
# NOTE(review): importances are computed on the same data the model was fitted on;
# a held-out split would give a less optimistic picture.
perm = PermutationImportance(cv, random_state=42).fit(features, labels)
eli5.show_weights(perm, feature_names=prediction_features)

In [None]:
# Every list holds a single value, so the "search" evaluates exactly one
# XGBoost configuration with 10-fold cross-validation.
xgb_param_grid = {
    'n_jobs': [4],                      # replaces the deprecated `nthread`
    'objective': ['reg:squarederror'],  # current name of the deprecated 'reg:linear' objective
    'learning_rate': [0.03],
    'max_depth': [6],
    'min_child_weight': [4],
    'verbosity': [0],                   # replaces the deprecated `silent` flag
    'subsample': [0.7],
    'colsample_bytree': [0.7],
    'n_estimators': [500],
}

xgb = GridSearchCV(XGBRegressor(random_state=42), xgb_param_grid, cv=10, scoring="neg_mean_squared_error")
xgb.fit(features, labels)

In [None]:
# RMSE-style score and parameters of the XGBoost grid search.
# The search was scored with "neg_mean_squared_error", hence the sign flip before the root.
np.sqrt(-xgb.best_score_), xgb.best_params_

In [None]:
# predict revenue for the test set from the same input columns used in training
test_features = test_df[prediction_features]
pred = xgb.predict(test_features)

In [None]:
# build the submission table (test ids plus predicted revenue) and write it without the index
submissions = test_df[['id']].assign(revenue=pred)
submissions.to_csv('submissions.csv', index=False)