In [None]:
# This Python 3 environment comes with many helpful analytics libraries installed
# It is defined by the kaggle/python docker image: https://github.com/kaggle/docker-python
# For example, here's several helpful packages to load in 

import numpy as np # linear algebra
import pandas as pd # data processing, CSV file I/O (e.g. pd.read_csv)
import matplotlib
import matplotlib.pyplot as plt
%matplotlib inline

# Input data files are available in the "../input/" directory.
# For example, running this (by clicking run or pressing Shift+Enter) will list the files in the input directory

import os
print(os.listdir("../input"))

# Any results you write to the current directory are saved as output.

## Load Train and Test Data

In [None]:
# Read the training CSV and print its (rows, columns) shape.
train_input = pd.read_csv("../input/train.csv")
print(train_input.shape)

# Feature frame: every column of the training data except the last one.
# NOTE(review): the split is positional — presumably the last column is the
# prediction target; confirm against the CSV header.
train_X = train_input.iloc[:, :-1]
train_X.head()

In [None]:
# Target series: the last column of the training frame (selected by position).
# NOTE(review): it is plotted under the title "Revenue" further down, so this is
# presumably the revenue column — confirm against the CSV header.
train_Y = train_input.iloc[:, -1]
train_Y.head()

In [None]:
# Read the test CSV and print its (rows, columns) shape.
# The cleaning cells visible below operate on train_X / train_input only;
# `test` is just inspected here.
test = pd.read_csv("../input/test.csv")
print(test.shape)
test.head()

## Data Cleaning

### Drop Columns

In [None]:
# Drop four columns from the feature frame. drop() returns a new frame, so
# train_input still holds them.
# NOTE(review): train_X is rebound to its own output, so running this cell a
# second time raises KeyError because the columns are already gone.
train_X = train_X.drop(columns=["homepage", "imdb_id", "poster_path", "status"])

train_X.head()

### Categorical to Numeric Transformation
#### original_language

In [None]:
# Share of rows whose original_language is "en".
print("ratio of ocurrences of \"en\" to all data: %f" % (len([i for i in train_X["original_language"] if i == "en"])/len(train_X["original_language"])))

# Number of rows per language code, stored as a {code: count} mapping.
unique, counts = np.unique([i for i in train_X["original_language"]], return_counts=True)
language_counts = dict(zip(unique, counts))

# Bar chart of the per-language row counts with "en" left out.
plt.figure(figsize=(18, 5))
plt.bar(*np.unique([i for i in train_X["original_language"] if i != "en"], return_counts=True))
plt.show()

# Minimum share of rows a language needs in order to get its own indicator column.
# Later cells read this global again when they print their own thresholds.
percentage_required = 0.01

# Strict ">": a language sitting exactly on the threshold is excluded, even though
# the message printed below says "at least".
languages = [i for i in language_counts.keys() if language_counts[i]/len(train_X["original_language"]) > percentage_required]

print("Languages that make up at least " + str(percentage_required * 100) + "% of the data " + str(languages))
# This value is a fraction of the rows (count / count); it is not multiplied by 100.
print("Data % represented: " + str(len([i for i in train_X["original_language"] if i in languages])/len(train_X["original_language"])))

# One indicator column per language code, then only the columns of the retained
# languages are selected — rows in any other language have no indicator set.
one_hot_lang = pd.get_dummies(pd.Categorical(train_X["original_language"]), prefix="language")[["language_" + i for i in languages]]
print(one_hot_lang.head())

# Replace the raw column with the indicator columns; concat along axis=1 matches
# rows by index label.
train_X = pd.concat([train_X.drop(columns="original_language"), one_hot_lang], axis=1)
train_X.head()

#### belongs_to_collection

In [None]:
import ast

# Read the raw column from train_input, which is never modified, instead of from
# train_X: this cell overwrites train_X["belongs_to_collection"], so deriving the
# flag from train_X would turn every value into 0 when the cell is run a second
# time (the column would no longer contain strings).
raw_collection = train_input["belongs_to_collection"]

# Collection ids per row: the cell value is a stringified list of dicts, and a
# missing value (non-string) becomes an empty list.
btc = raw_collection.apply(lambda x: [i["id"] for i in ast.literal_eval(x)] if type(x) == str else [])
# Number of distinct id lists, i.e. how many categories a direct encoding would need.
print(len(np.unique(btc)))

# high cardinality, will try using if it belongs to a collection or not as a feature instead

# 1 when the row has a collection entry (a string), 0 when the value is missing.
btc = raw_collection.apply(lambda x: 1 if type(x) == str else 0)

# Assignment matches rows by index label; train_X still carries train_input's row index here.
train_X["belongs_to_collection"] = btc
train_X.head()

In [None]:
from typing import List, Set, Dict, Tuple
from collections import defaultdict
import operator

def get_list_of_dicts_col(col_name: str, dict_key: str = None, df: pd.DataFrame = None) -> List[List]:
    """Parse a column of stringified lists of dicts into one Python list per row.

    Each string cell is evaluated with ``ast.literal_eval`` and is expected to
    yield a list of dicts.

    Args:
        col_name: Name of the column to parse.
        dict_key: When given, each dict is reduced to ``d[dict_key]``; when
            ``None`` the parsed dicts themselves are returned.
        df: Frame to read the column from. Defaults to the notebook-level
            ``train_input`` so existing calls keep working; pass another frame
            to apply the same parsing elsewhere.

    Returns:
        A list with exactly one entry per row of the frame, in row order.
        Cells that are not strings (missing values) yield an empty list.
        Keeping those rows, instead of skipping them, is what keeps the result
        positionally aligned with the frame: a one-hot frame built from it has
        as many rows as the frame it is later concatenated to.

    Raises:
        KeyError: If ``col_name`` is not a column, or ``dict_key`` is missing
            from one of the parsed dicts.
    """
    frame = train_input if df is None else df

    def parse_cell(cell):
        # Missing values are not strings; emit an empty list so the row keeps its slot.
        if not isinstance(cell, str):
            return []
        records = ast.literal_eval(cell)
        if dict_key is None:
            return records
        return [record[dict_key] for record in records]

    return [parse_cell(cell) for cell in frame[col_name]]

def get_sorted_unique_value_count_in_list_of_lists(list_of_lists: str or List[List], col_name: str = None, dict_key: str= None) -> List[Tuple]:
    """Count how often each item occurs across a list of lists.

    Args:
        list_of_lists: The lists whose items are counted. When a string is
            passed instead, its value is not used: the lists are loaded via
            ``get_list_of_dicts_col(col_name, dict_key)``.
        col_name: Column to load; required when ``list_of_lists`` is a string.
        dict_key: Dict key to extract; required when ``list_of_lists`` is a string.

    Returns:
        ``(item, count)`` tuples ordered by descending count. Items with equal
        counts stay in the order in which they were first encountered.

    Raises:
        AssertionError: If a string is passed without both ``col_name`` and ``dict_key``.
    """
    if type(list_of_lists) == str:
        assert col_name is not None and dict_key is not None, "specify the column name dict key for which to count unique values"
        list_of_lists = get_list_of_dicts_col(col_name, dict_key)

    tally = defaultdict(int)
    for value in (entry for inner in list_of_lists for entry in inner):
        tally[value] += 1

    # In-place sort is stable, so ties keep first-seen order.
    ranked = list(tally.items())
    ranked.sort(key=operator.itemgetter(1), reverse=True)
    return ranked

def unzip_tuples(list_of_tuples: List[Tuple]) -> List[List]:
    """Transpose a list of tuples into one list per tuple position.

    ``[(a1, b1), (a2, b2)]`` becomes ``[[a1, a2], [b1, b2]]``. An empty input
    yields an empty list, so indexing the result of an empty input fails.
    """
    transposed = zip(*list_of_tuples)
    return list(map(list, transposed))

def get_items_with_minimum_percentage(percentage: float, list_of_tuples: List[Tuple], col_name: str) -> List[Tuple]:
    """Keep the ``(item, count)`` pairs whose count is a large enough share of the rows.

    Args:
        percentage: Threshold as a fraction (e.g. ``0.01``); the comparison is
            strict, so a pair sitting exactly on the threshold is dropped.
        list_of_tuples: ``(item, count)`` pairs to filter; input order is preserved.
        col_name: Column of the notebook-level ``train_input`` whose row count
            is used as the denominator.

    Returns:
        The pairs for which ``count / len(train_input[col_name]) > percentage``.
    """
    def exceeds_threshold(pair: Tuple) -> bool:
        # The share is measured against the number of rows in the training column.
        return pair[1] / len(train_input[col_name]) > percentage

    return list(filter(exceeds_threshold, list_of_tuples))

def get_one_hot_df(columns: List, list_of_item_lists: List[List], column_name_prepend: str) -> pd.DataFrame:
    """Build a 0/1 indicator frame from per-row item lists.

    All rows are collected first and the frame is constructed once. Growing a
    frame with ``DataFrame.append`` inside a loop copies the frame on every
    iteration, and that method no longer exists in pandas 2.0 and later.

    Args:
        columns: Items that each get an indicator column, in output column order.
        list_of_item_lists: One list of items per output row.
        column_name_prepend: Prefix put in front of ``str(item)`` to form each
            column name.

    Returns:
        An int64 DataFrame with a default 0..n-1 index, one row per entry of
        ``list_of_item_lists`` and one column per entry of ``columns``; a cell
        is 1 when the row's list contains the column's item, otherwise 0.
        An empty ``list_of_item_lists`` gives an empty frame that still has
        all the columns.
    """
    column_names = [column_name_prepend + str(i) for i in columns]
    rows = [
        [1 if item in item_list else 0 for item in columns]
        for item_list in list_of_item_lists
    ]
    return pd.DataFrame(rows, columns=column_names, dtype="int64")

#### genres

In [None]:
# Per-row lists of genre ids parsed from the "genres" column of train_input.
genre_col = get_list_of_dicts_col("genres", "id")

# (genre id, count) pairs, most frequent first.
genre_count_tuples = get_sorted_unique_value_count_in_list_of_lists(genre_col)

print("Number of unique genres: %d" % len(genre_count_tuples))

# Every genre id is kept — no minimum-frequency filter is applied to this column.
genre_list = unzip_tuples(genre_count_tuples)[0]
print(genre_list)

# NOTE(review): pd.concat(axis=1) matches rows by index label, and one_hot_genre
# has a fresh 0..n-1 index with one row per entry of genre_col — confirm that
# genre_col has exactly one entry per train_X row.
one_hot_genre = get_one_hot_df(genre_list, genre_col, "genre_")
train_X = pd.concat([train_X.drop(columns="genres"), one_hot_genre], axis=1)
train_X.head()

#### production_countries

In [None]:
# Per-row lists of country codes (the "iso_3166_1" field) parsed from
# train_input["production_countries"].
country_col = get_list_of_dicts_col("production_countries", "iso_3166_1")
# (country code, count) pairs, most frequent first.
country_count_tuples = get_sorted_unique_value_count_in_list_of_lists(country_col)

print("Number of unique countries: %d" % len(country_count_tuples))

countries_list = unzip_tuples(country_count_tuples)[0]
countries_counts_list = unzip_tuples(country_count_tuples)[1]

# Overview of the 40 most frequent countries before any filtering.
plt.figure(figsize=(20,8))
plt.bar(countries_list[:40], countries_counts_list[:40])
plt.title("Top 40 Countries")
plt.show()

# The threshold is owned by this cell, so the filter and the message below can
# never disagree. Previously the message printed the language cell's
# `percentage_required` while the filter used a separate literal.
country_min_share = 0.01
country_count_tuples = get_items_with_minimum_percentage(country_min_share, country_count_tuples, "production_countries")

countries_list = unzip_tuples(country_count_tuples)[0]
countries_counts_list = unzip_tuples(country_count_tuples)[1]

print("Countries that make up at least " + str(country_min_share * 100) + "% of the data " + str(countries_list))

# One indicator column per retained country.
one_hot_country = get_one_hot_df(countries_list, country_col, "country_")

# Replace the raw column with the indicator columns (rows are matched by index label).
train_X = pd.concat([train_X.drop(columns="production_countries"), one_hot_country], axis=1)
train_X.head()

#### Production Companies

In [None]:
# Per-row lists of production-company ids parsed from
# train_input["production_companies"].
company_col = get_list_of_dicts_col("production_companies", "id")
# (company id, count) pairs, most frequent first.
company_count_tuples = get_sorted_unique_value_count_in_list_of_lists(company_col)

print("Number of unique companies: %d" % len(company_count_tuples))

companies_list = unzip_tuples(company_count_tuples)[0]
companies_counts_list = unzip_tuples(company_count_tuples)[1]

# Overview of the 40 most frequent companies; ids are converted to strings so
# the bars are drawn as categories rather than at numeric x positions.
plt.figure(figsize=(20,8))
plt.bar([str(i) for i in companies_list[:40]], companies_counts_list[:40])
plt.title("Top 40 Companies")
plt.show()

# The threshold is owned by this cell, so the filter and the message below can
# never disagree. Previously the message printed the language cell's
# `percentage_required` while the filter used a separate literal.
company_min_share = 0.01
company_count_tuples = get_items_with_minimum_percentage(company_min_share, company_count_tuples, "production_companies")

companies_list = unzip_tuples(company_count_tuples)[0]
# Index 1 holds the counts; index 0 (used here before) is the id list again.
companies_counts_list = unzip_tuples(company_count_tuples)[1]

print("Companies that make up at least " + str(company_min_share * 100) + "% of the data " + str(companies_list))

# One indicator column per retained company.
one_hot_company = get_one_hot_df(companies_list, company_col, "company_")

# Replace the raw column with the indicator columns (rows are matched by index label).
train_X = pd.concat([train_X.drop(columns="production_companies"), one_hot_company], axis=1)
train_X.head()

#### Keywords

In [None]:
# Per-row lists of keyword ids parsed from the "Keywords" column of train_input.
keywords_col = get_list_of_dicts_col("Keywords", "id")
# (keyword id, count) pairs, most frequent first.
keywords_count_tuples = get_sorted_unique_value_count_in_list_of_lists(keywords_col)

print("Number of unique keywords: %d" % len(keywords_count_tuples))

keywords_list = unzip_tuples(keywords_count_tuples)[0]
keywords_counts_list = unzip_tuples(keywords_count_tuples)[1]

# Overview of the 40 most frequent keywords; ids are converted to strings for the x axis.
plt.figure(figsize=(20,8))
plt.bar([str(i) for i in keywords_list[:40]], keywords_counts_list[:40])
plt.title("Top 40 Keywords")
plt.show()

# Keep keywords whose count exceeds 2% of the training rows. This is a different
# threshold from the 0.01 used for countries and companies.
keywords_count_tuples = get_items_with_minimum_percentage(0.02, keywords_count_tuples, "Keywords")

keywords_list = unzip_tuples(keywords_count_tuples)[0]
keywords_counts_list = unzip_tuples(keywords_count_tuples)[1]

print("Number of keywords used for one hot encoding: " + str(len(keywords_list)))

# One indicator column per retained keyword.
# NOTE(review): pd.concat(axis=1) matches rows by index label, and
# one_hot_keywords has a fresh 0..n-1 index with one row per entry of
# keywords_col — confirm that keywords_col has exactly one entry per train_X row.
one_hot_keywords = get_one_hot_df(keywords_list, keywords_col, "keyword_")

train_X = pd.concat([train_X.drop(columns="Keywords"), one_hot_keywords], axis=1)
train_X.head()

In [None]:
# Base URL kept for later use; nothing in this cell reads it.
poster_url_head = "https://image.tmdb.org/t/p/w600_and_h900_bestv2"
from functools import reduce

# Three side-by-side histograms. Each entry is (title, values, bin edges), and
# the fixed bin edges cap the range that is shown.
plt.figure(figsize=(15,5))
hist_panels = [
    ("Budget", train_X["budget"], np.linspace(0, 3e8, 50)),
    ("Revenue", train_Y, np.linspace(0, 3.5e8, 50)),
    ("Popularity", train_X["popularity"], np.linspace(0, 200, 100)),
]

for panel_no, (panel_title, panel_values, panel_bins) in enumerate(hist_panels, start=1):
    # Slot panel_no in a 1x3 grid, left to right.
    plt.subplot(1, 3, panel_no)
    plt.hist(panel_values, bins=panel_bins)
    plt.title(panel_title)

plt.show()

In [None]:
# print(len([i for i in zip(train["original_title"], train_X["budget"]) if i[1] != 0 and i[1] < 700]))


In [None]:
import ast

# Titles of the training rows whose "genres" value is missing (i.e. not a string).
# This reads from train_input: the "genres" column has already been removed from
# train_X by the genre one-hot step, so train_X["genres"] raises KeyError when
# the notebook is run top to bottom.
print([title for genres, title in zip(train_input["genres"], train_input["original_title"]) if type(genres) != str])

In [None]:
# import seaborn as sns
# 
# colormap = plt.cm.RdBu
# plt.figure(figsize=(14,12))
# plt.title('Pearson Correlation of Features', y=1.05, size=15)
# sns.heatmap(train.astype(float).corr(),linewidths=0.1,vmax=1.0, 
#             square=True, cmap=colormap, linecolor='white', annot=True)