In [None]:
import ast

import numpy as np
import pandas as pd
from sklearn import *
import xgboost as xgb

# Read the three competition CSVs, then display their shapes as the cell output.
train, test, sub = (
    pd.read_csv(csv_path)
    for csv_path in (
        '../input/train.csv',
        '../input/test.csv',
        '../input/sample_submission.csv',
    )
)
(train.shape, test.shape, sub.shape)

In [None]:
# These columns could later feed image or sentiment analysis; they are unused
# for now, so remove them from both frames (in place).
unused_columns = ['imdb_id', 'homepage', 'poster_path']
for frame in (train, test):
    frame.drop(columns=unused_columns, inplace=True)

In [None]:
def date_features(df):
    """Replace ``release_date`` with numeric calendar columns.

    The ``release_date`` column is parsed with ``pd.to_datetime`` and expanded
    into ``release_year``, ``release_month``, ``release_quarter`` and
    ``release_dow`` (day of week). The original ``release_date`` column is then
    removed.

    The frame is modified in place and the same object is returned.
    """
    released = pd.to_datetime(df['release_date'])
    calendar_parts = (
        ('release_year', 'year'),
        ('release_month', 'month'),
        ('release_quarter', 'quarter'),
        ('release_dow', 'dayofweek'),
    )
    for column, part in calendar_parts:
        df[column] = getattr(released.dt, part)
    df.drop('release_date', axis=1, inplace=True)
    return df

# Expand the release date into calendar features on both frames.
train, test = date_features(train), date_features(test)

In [None]:
def get_dictionary(s):
    """Parse a Python-literal string (e.g. a list of dicts) into an object.

    Parameters
    ----------
    s : str or any
        Text holding a Python literal such as ``"[{'id': 1, 'name': 'x'}]"``.
        Non-string values (for example NaN for a missing cell) are accepted.

    Returns
    -------
    object
        The parsed literal, or an empty dict ``{}`` when ``s`` is not a string
        or does not contain a valid literal.

    Notes
    -----
    ``ast.literal_eval`` is used instead of ``eval`` so that cell contents are
    only ever parsed as data and can never execute code.
    """
    try:
        d = ast.literal_eval(s)
    except (ValueError, SyntaxError, TypeError, MemoryError, RecursionError):
        # Missing or malformed input: fall back to an empty container so that
        # callers can still take len() of it or iterate over it.
        d = {}
    return d

In [None]:
# could test dummies but going with simple encoding
# Binary flag: 1 when the parsed belongs_to_collection value is non-empty, else 0.
for frame in (train, test):
    parsed_sizes = frame['belongs_to_collection'].map(lambda raw: len(get_dictionary(raw)))
    frame['belongs_to_collection'] = parsed_sizes.clip(0, 1)

# NOTE: one-hot dummies via str.get_dummies(sep=',') were considered for genres,
# but that needs train and test concatenated first.

# Column -> key to pull out of every parsed entry in that column.
list_column_keys = {
    'genres': 'id',
    'production_companies': 'id',
    'production_countries': 'iso_3166_1',
    'spoken_languages': 'iso_639_1',
    'Keywords': 'id',
}


def _joined_keys(raw, key):
    """Return the sorted ``key`` values of the parsed entries, comma-joined."""
    values = [entry[key] for entry in get_dictionary(raw)]
    values.sort()
    return ','.join(str(value) for value in values)


# Collapse each list-like column into a single comma-separated string.
for column, key in list_column_keys.items():
    for frame in (train, test):
        frame[column] = frame[column].map(lambda raw, key=key: _joined_keys(raw, key))

# Label-encode the string-valued columns. Each encoder is fitted on the union
# of train and test values so that both frames share a single integer mapping.
for c in ['original_language', 'status', 'genres', 'production_companies', 'production_countries', 'spoken_languages', 'Keywords']:
    lbl = preprocessing.LabelEncoder()
    # Normalise once and reuse the same values for fit and transform; otherwise
    # a non-string value would be fitted as text but looked up in its raw form.
    train_values = train[c].fillna('').astype(str)
    test_values = test[c].fillna('').astype(str)
    lbl.fit(list(train_values) + list(test_values))
    train[c] = lbl.transform(train_values)
    test[c] = lbl.transform(test_values)
    print(c, len(lbl.classes_))

In [None]:
# cast and crew are dropped for now; together with the dictionary-style columns
# above they would probably make great graph features - analysis for another time.
people_columns = ['cast', 'crew']
for frame in (train, test):
    frame.drop(columns=people_columns, inplace=True)

In [None]:
# The raw text columns are dropped for now; only simple length features are kept.
def standard_text_features(df):
    """Replace the free-text columns with character and word counts.

    For each of ``original_title``, ``title``, ``tagline`` and ``overview`` two
    columns are added: ``<name>_len`` (length of ``str(value)``) and
    ``<name>_wlen`` (number of pieces after splitting ``str(value)`` on a
    single space). The four source columns are then removed.

    Values are passed through ``str`` first, so a missing value is measured as
    the text ``'nan'``. The frame is modified in place and the same object is
    returned.
    """
    text_columns = ['original_title', 'title', 'tagline', 'overview']
    for name in text_columns:
        as_text = df[name].map(str)
        df[name + '_len'] = as_text.map(len)
        df[name + '_wlen'] = as_text.map(lambda text: len(text.split(' ')))
    df.drop(columns=text_columns, inplace=True)
    return df

# Swap the free-text columns for length features on both frames.
train, test = standard_text_features(train), standard_text_features(test)

In [None]:
# Model features: every column except the row identifier and the target.
col = [c for c in train.columns if c not in ['id', 'revenue']]

# NOTE(review): 'reg:linear', 'silent' and ntree_limit/best_ntree_limit are
# deprecated or removed in newer XGBoost releases - confirm against the
# installed version before upgrading.
params = {'eta': 0.02, 'objective': 'reg:linear', 'max_depth': 6, 'subsample': 0.8, 'colsample_bytree': 0.8,  'eval_metric': 'rmse', 'seed': 1, 'silent': True}

# The target is modelled as log1p(revenue); 20% of the rows are held out as the
# validation set that drives early stopping. Missing features are filled with -1.
x1, x2, y1, y2 = model_selection.train_test_split(train[col].fillna(-1), np.log1p(train['revenue']), test_size=0.2, random_state=1)

# Build each DMatrix once and share it between the watchlist and the trainer.
dtrain = xgb.DMatrix(x1, y1)
dvalid = xgb.DMatrix(x2, y2)
watchlist = [(dtrain, 'train'), (dvalid, 'valid')]
model = xgb.train(params, dtrain, 2500,  watchlist, verbose_eval=100, early_stopping_rounds=200)

# Give the test features the same fillna(-1) treatment as the training
# features, then invert the log1p transform on the predictions.
test['revenue'] = np.expm1(model.predict(xgb.DMatrix(test[col].fillna(-1)), ntree_limit=model.best_ntree_limit))
test[['id', 'revenue']].to_csv('submission.csv', index=False)
xgb.plot_importance(model, importance_type='weight', max_num_features=20)