# Data Preprocessing

## Import libraries

In [1]:
import numpy as np
import pandas as pd
import ast

## Load the CSV files

In [2]:
# Read the raw train/test splits from disk.
train_df = pd.read_csv('./dataset/train.csv')
test_df = pd.read_csv('./dataset/test.csv')
print('train shape:', train_df.shape)
print('test shape:', test_df.shape)

# Split the target column off the training frame so that only feature
# columns remain in train_df before it is stacked with the test frame.
train_y_df = train_df.pop('revenue')

# concat datasets for combined modifications
# Train rows come first, test rows second; later cells rely on that order
# to split the frame again by position.
# ignore_index=True gives the stacked frame a fresh 0..n-1 row index.
# Without it both frames keep their own 0-based labels, so the result has
# duplicate index labels, which breaks any label-based alignment
# (e.g. a column-wise concat against a frame with a different index).
concat_df = pd.concat(objs=[train_df, test_df], axis=0, ignore_index=True)
print('concat shape:', concat_df.shape)

# free memory
del train_df
del test_df

train shape: (3000, 23)
test shape: (4398, 22)
concat shape: (7398, 22)


## Remove unused fields

In [3]:
# Columns that this notebook does not turn into features.
unused_fields = [
    'homepage', 'imdb_id', 'original_title', 'overview', 'poster_path',
    'release_date', 'status', 'tagline', 'title', 'cast', 'crew',
]

# Drop them all in one call and report the remaining shape.
concat_df = concat_df.drop(columns=unused_fields)
print('concat shape:', concat_df.shape)

concat shape: (7398, 11)


## Convert JSON strings to lists of ids
The ids of each field are joined with `|` so that the field can be one-hot encoded later.

In [4]:
# Fields whose cells hold a string representation of a list of dicts.
json_fields = [
    'belongs_to_collection', 'genres', 'production_companies',
    'production_countries', 'spoken_languages', 'Keywords',
]

# Dict key that identifies an entry, for the fields that do not use 'id'.
id_key_overrides = {
    'production_countries': 'iso_3166_1',
    'spoken_languages': 'iso_639_1',
}


def _joined_ids(raw, id_key):
    """Parse one cell and return its entries' ids joined with '|'.

    raw is the string form of a list of dicts; each dict is looked up by
    id_key and the value is converted to str. An empty list gives ''.
    """
    return '|'.join(str(entry[id_key]) for entry in ast.literal_eval(raw))


for json_field in json_fields:
    id_key = id_key_overrides.get(json_field, 'id')
    # Missing cells become '[]' so they parse to an empty list.
    raw_cells = concat_df[json_field].replace(np.nan, '[]')
    concat_df[json_field] = raw_cells.apply(_joined_ids, args=(id_key,))
concat_df.head()

Unnamed: 0,id,belongs_to_collection,budget,genres,original_language,popularity,production_companies,production_countries,runtime,spoken_languages,Keywords
0,1,313576.0,14000000,35,en,6.575393,4|60|8411,US,93.0,en,4379|9663|11830|179431
1,2,107674.0,40000000,35|18|10751|10749,en,8.248895,2,US,113.0,en,2505|4263|6038|13072
2,3,,3300000,18,en,64.29999,2266|3172|32157,US,105.0,en,1416|1523|1640|2176|14512|14819|33896|156823|1...
3,4,,1200000,53|18,hi,3.174936,,IN,122.0,en|hi,10092|10540|11734|14536|14636|208364|220935
4,5,,0,28|53,ko,1.14807,,KR,118.0,ko,


## One-hot encoding for categorical fields

In [5]:
# Fields to expand into 0/1 indicator columns. Multi-valued fields hold
# their values joined with '|', which is the separator passed to get_dummies.
categorical_fields = [
    'belongs_to_collection', 'genres', 'original_language',
    'production_companies', 'production_countries', 'spoken_languages',
    'Keywords',
]

# Build every indicator frame first and concatenate once at the end.
# Concatenating inside the loop copies the whole, ever wider frame on every
# iteration; a single concat produces the same columns in the same order.
dummy_frames = []
for field in categorical_fields:
    # Prefix each indicator column with its source field name so that equal
    # values coming from different fields stay distinct.
    dummy = concat_df[field].str.get_dummies(sep='|').add_prefix(field+'_')
    print(field+' dummy shape:', dummy.shape)
    dummy_frames.append(dummy)

# Remaining non-categorical columns first, then the indicator blocks in
# field order.
concat_df = pd.concat(
    objs=[concat_df.drop(columns=categorical_fields)] + dummy_frames,
    axis=1,
)

# The indicator frames are large and no longer needed once combined.
del dummy_frames, dummy
concat_df.head()

belongs_to_collection dummy shape: (7398, 750)
genres dummy shape: (7398, 20)
original_language dummy shape: (7398, 44)
production_companies dummy shape: (7398, 7139)
production_countries dummy shape: (7398, 98)
spoken_languages dummy shape: (7398, 98)
Keywords dummy shape: (7398, 11930)


Unnamed: 0,id,budget,popularity,runtime,belongs_to_collection_10,belongs_to_collection_100286,belongs_to_collection_1006,belongs_to_collection_100693,belongs_to_collection_100965,belongs_to_collection_100970,...,Keywords_9974,Keywords_9977,Keywords_9986,Keywords_9988,Keywords_9989,Keywords_999,Keywords_9990,Keywords_9991,Keywords_9993,Keywords_9995
0,1,14000000,6.575393,93.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
1,2,40000000,8.248895,113.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
2,3,3300000,64.29999,105.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,1200000,3.174936,122.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,0,1.14807,118.0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


## Export

In [6]:
# The combined frame was built with the training rows first, so the first
# len(train_y_df) rows are the training features and the rest are test rows.
n_train = len(train_y_df)
train_df = concat_df.iloc[:n_train]
test_df = concat_df.iloc[n_train:]

print('train_x shape:', train_df.shape)
print('train_y shape:', train_y_df.shape)
print('test shape:', test_df.shape)

# Convert to plain numpy arrays and store all three in a single .npz archive
# under the keys train_x, train_y and test_x.
train_x, train_y, test_x = train_df.values, train_y_df.values, test_df.values
np.savez(
    './dataset/preprocessed.npz',
    train_x=train_x,
    train_y=train_y,
    test_x=test_x,
)

train_x shape: (3000, 20083)
train_y shape: (3000,)
test shape: (4398, 20083)
