<a href="https://colab.research.google.com/github/galhev/Featuretools/blob/master/featuretools.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Preprocessing with Featuretools & Sklearn-pandas

In [2]:
# Install package
pip install featuretools



In [3]:
# Install package
pip install sklearn-pandas



In [0]:
# Visualization:
# https://www.kaggle.com/rounakbanik/the-story-of-film

# Download dataset:
# https://www.kaggle.com/rounakbanik/the-movies-dataset#ratings.csv


In [0]:
import pandas as pd
from sklearn_pandas import DataFrameMapper, gen_features,  CategoricalImputer
import sklearn.preprocessing
from datetime import date
import numpy as np

In [5]:
# save model image file to Google Drive
from google.colab import drive
drive.mount('/content/gdrive')

Drive already mounted at /content/gdrive; to attempt to forcibly remount, call drive.mount("/content/gdrive", force_remount=True).


In [6]:
!ls -la /content/gdrive/My\ Drive/Colab\ Notebooks/featuretools

total 729065
-rw------- 1 root root    127307 Sep 16 13:52 featuretools.ipynb
-rw------- 1 root root  34445126 Sep 11 11:37 movies_metadata.csv
-rw------- 1 root root 709550327 Sep 11 11:38 ratings.csv
-rw------- 1 root root   2438266 Sep 15 18:27 ratings_small.csv


In [7]:
movies = pd.read_csv('/content/gdrive/My Drive/Colab Notebooks/featuretools/movies_metadata.csv')
ratings = pd.read_csv('/content/gdrive/My Drive/Colab Notebooks/featuretools/ratings.csv')

  interactivity=interactivity, compiler=compiler, result=result)


In [8]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435
2,1,858,5.0,1425941523
3,1,1221,5.0,1425941546
4,1,1246,5.0,1425941556


In [0]:
movies.rename(columns={'id': 'movieId'}, inplace=True)

In [0]:
movies['movieId'] = movies['movieId'].apply(lambda x: x if x.isdigit() else 999999999)

In [0]:
movies['budget'] = movies['budget'].apply(lambda x: x if x.isdigit() else 0)

In [0]:
movies['movieId'] = movies['movieId'].astype('int64')

In [13]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 24 columns):
adult                    45466 non-null object
belongs_to_collection    4494 non-null object
budget                   45466 non-null object
genres                   45466 non-null object
homepage                 7782 non-null object
movieId                  45466 non-null int64
imdb_id                  45449 non-null object
original_language        45455 non-null object
original_title           45466 non-null object
overview                 44512 non-null object
popularity               45461 non-null object
poster_path              45080 non-null object
production_companies     45463 non-null object
production_countries     45463 non-null object
release_date             45379 non-null object
revenue                  45460 non-null float64
runtime                  45203 non-null float64
spoken_languages         45460 non-null object
status                   45379 non-null object

In [14]:
movies.isna().sum()

adult                        0
belongs_to_collection    40972
budget                       0
genres                       0
homepage                 37684
movieId                      0
imdb_id                     17
original_language           11
original_title               0
overview                   954
popularity                   5
poster_path                386
production_companies         3
production_countries         3
release_date                87
revenue                      6
runtime                    263
spoken_languages             6
status                      87
tagline                  25054
title                        6
video                        6
vote_average                 6
vote_count                   6
dtype: int64

In [0]:
movies = movies.drop(['overview','homepage','original_title','imdb_id', 'belongs_to_collection', 'genres','poster_path', 'production_companies','production_countries','spoken_languages', 'tagline'], axis=1)

In [0]:
# Set up transformations for numerical and categorical columns
# col_categorical = list( movies.select_dtypes(exclude=[np.int64]) )

col_cat_list = list( movies.select_dtypes(exclude=np.number) )
col_num_list = list( movies.select_dtypes(include=np.number) )
col_none = ['movieId']

num_cols = ['budget', 'popularity']
[ col_cat_list.remove(x) for x in num_cols ]
[ col_num_list.append(x) for x in num_cols ]
col_cat_list.remove('release_date')
col_num_list.remove('movieId')

movies[col_cat_list] = movies[col_cat_list].astype('category')
# movies[col_num_list] = movies[col_num_list].astype('float64', errors='coerce')
movies['budget'] = pd.to_numeric(movies['budget'],errors="coerce")
movies['popularity'] = pd.to_numeric(movies['popularity'],errors="coerce")

# movies[col_num_list] = [ pd.to_numeric(movies[x], errors='coerce') for x in col_num_list ]

# Convert to list of lists
col_categorical = [ [x] for x in col_cat_list ]
col_numerical   = [ [x] for x in col_num_list ]
col_date = ['release_date']


In [17]:
movies.head()

Unnamed: 0,adult,budget,movieId,original_language,popularity,release_date,revenue,runtime,status,title,video,vote_average,vote_count
0,False,30000000,862,en,21.946943,1995-10-30,373554033.0,81.0,Released,Toy Story,False,7.7,5415.0
1,False,65000000,8844,en,17.015539,1995-12-15,262797249.0,104.0,Released,Jumanji,False,6.9,2413.0
2,False,0,15602,en,11.7129,1995-12-22,0.0,101.0,Released,Grumpier Old Men,False,6.5,92.0
3,False,16000000,31357,en,3.859495,1995-12-22,81452156.0,127.0,Released,Waiting to Exhale,False,6.1,34.0
4,False,0,11862,en,8.387519,1995-02-10,76578911.0,106.0,Released,Father of the Bride Part II,False,5.7,173.0


In [0]:
movies['release_date']=pd.to_datetime(movies['release_date'], errors="coerce")

In [19]:
movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 13 columns):
adult                45466 non-null category
budget               45466 non-null int64
movieId              45466 non-null int64
original_language    45455 non-null category
popularity           45460 non-null float64
release_date         45376 non-null datetime64[ns]
revenue              45460 non-null float64
runtime              45203 non-null float64
status               45379 non-null category
title                45460 non-null category
video                45460 non-null category
vote_average         45460 non-null float64
vote_count           45460 non-null float64
dtypes: category(5), datetime64[ns](1), float64(5), int64(2)
memory usage: 4.7 MB


In [0]:
#movies['release_date'].fillna('1900-01-01', inplace=True)

In [0]:
from sklearn.base import TransformerMixin

class DateEncoder(TransformerMixin):
  def fit(self, X, y=None):
    return self

  def transform(self, X):
    X = pd.to_datetime(movies['release_date'])
    dt = X.dt
    return pd.concat([dt.year, dt.month, dt.day],  axis=1)

In [0]:
classes_categorical = [ CategoricalImputer, sklearn.preprocessing.LabelEncoder]
classes_numerical = [ {'class':sklearn.preprocessing.Imputer, 'strategy' : 'median'}, sklearn.preprocessing.StandardScaler]
classes_dates = [DateEncoder]
classes_none = [None]

In [23]:
feature_def = gen_features(
    columns = col_categorical
    , classes = classes_categorical
)


feature_def_numerical = gen_features(
    columns = col_numerical
    , classes = classes_numerical
)


feature_def_date = gen_features(
    columns = col_date
    , classes = classes_dates
)


feature_def_none = gen_features(
    columns = col_none
    , classes = classes_none
)

feature_def.extend(feature_def_date)
feature_def.extend(feature_def_numerical)
feature_def.extend(feature_def_none)



In [24]:
feature_def

[(['adult'],
  [CategoricalImputer(copy=True, fill_value='?', missing_values='NaN',
                      strategy='most_frequent'), LabelEncoder()]),
 (['original_language'],
  [CategoricalImputer(copy=True, fill_value='?', missing_values='NaN',
                      strategy='most_frequent'), LabelEncoder()]),
 (['status'],
  [CategoricalImputer(copy=True, fill_value='?', missing_values='NaN',
                      strategy='most_frequent'), LabelEncoder()]),
 (['title'],
  [CategoricalImputer(copy=True, fill_value='?', missing_values='NaN',
                      strategy='most_frequent'), LabelEncoder()]),
 (['video'],
  [CategoricalImputer(copy=True, fill_value='?', missing_values='NaN',
                      strategy='most_frequent'), LabelEncoder()]),
 ('release_date', [<__main__.DateEncoder at 0x7ff81b9784a8>]),
 (['revenue'],
  [Imputer(axis=0, copy=True, missing_values='NaN', strategy='median', verbose=0),
   StandardScaler(copy=True, with_mean=True, with_std=True)]),
 (['runt

In [25]:
mapper = DataFrameMapper(feature_def , df_out = True)
new_df_movies = mapper.fit_transform(movies)

  y = column_or_1d(y, warn=True)


In [26]:
movies.head()

Unnamed: 0,adult,budget,movieId,original_language,popularity,release_date,revenue,runtime,status,title,video,vote_average,vote_count
0,False,30000000,862,en,21.946943,1995-10-30,373554033.0,81.0,Released,Toy Story,False,7.7,5415.0
1,False,65000000,8844,en,17.015539,1995-12-15,262797249.0,104.0,Released,Jumanji,False,6.9,2413.0
2,False,0,15602,en,11.7129,1995-12-22,0.0,101.0,Released,Grumpier Old Men,False,6.5,92.0
3,False,16000000,31357,en,3.859495,1995-12-22,81452156.0,127.0,Released,Waiting to Exhale,False,6.1,34.0
4,False,0,11862,en,8.387519,1995-02-10,76578911.0,106.0,Released,Father of the Bride Part II,False,5.7,173.0


In [0]:
new_df_movies.rename(columns={'release_date_0': 'year', 'release_date_1': 'month', 'release_date_2':'day'}, inplace=True)

In [28]:
new_df_movies.head()

Unnamed: 0,adult,original_language,status,title,video,year,month,day,revenue,runtime,vote_average,vote_count,budget,popularity,movieId
0,3,20,4,39018,0,1995.0,10.0,30.0,5.632841,-0.342939,1.081946,10.798693,1.479373,3.168317,862
1,3,20,4,16806,0,1995.0,12.0,15.0,3.911075,0.257643,0.666161,4.688049,3.488165,2.347099,8844
2,3,20,4,13371,0,1995.0,12.0,22.0,-0.174232,0.179307,0.458269,-0.036404,-0.24245,1.464059,15602
3,3,20,4,40428,0,1995.0,12.0,22.0,1.09198,0.858225,0.250376,-0.154464,0.675855,0.156246,31357
4,3,20,4,11199,0,1995.0,2.0,10.0,1.016224,0.309868,0.042484,0.128474,-0.24245,0.91029,11862


In [0]:
new_df_movies[col_cat_list] = new_df_movies[col_cat_list].astype('category')

In [30]:
new_df_movies.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 45466 entries, 0 to 45465
Data columns (total 15 columns):
adult                45466 non-null category
original_language    45466 non-null category
status               45466 non-null category
title                45466 non-null category
video                45466 non-null category
year                 45376 non-null float64
month                45376 non-null float64
day                  45376 non-null float64
revenue              45466 non-null float64
runtime              45466 non-null float64
vote_average         45466 non-null float64
vote_count           45466 non-null float64
budget               45466 non-null float64
popularity           45466 non-null float64
movieId              45466 non-null int64
dtypes: category(5), float64(9), int64(1)
memory usage: 5.4 MB


In [31]:
import featuretools as ft

ft.list_primitives()

Unnamed: 0,name,type,description
0,num_true,aggregation,Finds the number of 'True' values in a boolean.
1,count,aggregation,Counts the number of non null values.
2,mean,aggregation,Computes the average value of a numeric feature.
3,n_most_common,aggregation,Finds the N most common elements in a categori...
4,time_since_last,aggregation,Time since last related instance.
5,avg_time_between,aggregation,Computes the average time between consecutive ...
6,trend,aggregation,Calculates the slope of the linear trend of va...
7,median,aggregation,Finds the median value of any feature with wel...
8,mode,aggregation,Finds the most common element in a categorical...
9,min,aggregation,Finds the minimum non-null value of a numeric ...


In [0]:
new_df_movies = new_df_movies.drop_duplicates('movieId')

In [0]:
col_cat_list.append('year')
col_cat_list.append('month')
col_cat_list.append('day')

In [34]:
# initialize entityset
es = ft.EntitySet(id = 'movies_entitySet')
variable_types = {cat:ft.variable_types.Categorical for cat in col_cat_list}

es.entity_from_dataframe(entity_id = 'movies_entity_id', dataframe = new_df_movies,  make_index = False, index = 'movieId', variable_types = variable_types)


Entityset: movies_entitySet
  Entities:
    movies_entity_id [Rows: 45434, Columns: 15]
  Relationships:
    No relationships

In [35]:
es['movies_entity_id']

Entity: movies_entity_id
  Variables:
    movieId (dtype: index)
    revenue (dtype: numeric)
    runtime (dtype: numeric)
    vote_average (dtype: numeric)
    vote_count (dtype: numeric)
    budget (dtype: numeric)
    popularity (dtype: numeric)
    adult (dtype: categorical)
    original_language (dtype: categorical)
    status (dtype: categorical)
    title (dtype: categorical)
    video (dtype: categorical)
    year (dtype: categorical)
    month (dtype: categorical)
    day (dtype: categorical)
  Shape:
    (Rows: 45434, Columns: 15)

In [36]:
es['movies_entity_id'].df

Unnamed: 0,movieId,revenue,runtime,vote_average,vote_count,budget,popularity,adult,original_language,status,title,video,year,month,day
2,2,-0.174232,-0.656286,0.770107,-0.134109,-0.242450,0.156412,3,26,4,3036,0,1988.0,10.0,21.0
3,3,-0.174232,-0.473500,0.770107,-0.152429,-0.242450,-0.104768,3,26,4,27311,0,1986.0,10.0,16.0
5,5,-0.107386,0.100970,0.458269,0.873476,-0.012873,1.016712,3,20,4,11966,0,1995.0,12.0,9.0
6,6,0.014443,0.414317,0.406296,-0.062865,-0.242450,0.435876,3,20,4,16770,0,1993.0,10.0,15.0
11,11,11.879695,0.701552,1.289838,13.573113,0.388885,6.532646,3,20,4,28841,0,1977.0,5.0,25.0
12,12,14.443726,0.153194,1.029973,12.583848,5.152594,3.759634,3,20,4,11441,0,2003.0,5.0,30.0
13,13,10.364748,1.249909,1.341811,16.359746,2.914225,7.558043,3,20,4,11913,0,1994.0,7.0,6.0
14,14,5.364566,0.727664,1.185892,6.774461,0.618461,2.965091,3,20,4,2384,0,1999.0,9.0,15.0
15,15,0.186698,0.649327,1.237865,2.308520,-0.194254,2.146662,3,20,4,7036,0,1941.0,4.0,30.0
16,16,0.448083,1.197684,1.081946,0.574254,0.492194,1.292853,3,20,4,8213,0,2000.0,5.0,17.0


In [37]:
movies_new_features, feature_defs = ft.dfs(entityset = es, 
                                    target_entity = 'movies_entity_id',
                                    trans_primitives = ['add', 'multiply'])

movies_new_features.head()

Unnamed: 0_level_0,revenue,runtime,vote_average,vote_count,budget,popularity,adult,original_language,status,title,video,year,month,day,revenue + runtime,vote_average + vote_count,revenue + vote_average,budget + vote_average,runtime + vote_average,popularity + vote_average,budget + popularity,revenue + vote_count,runtime + vote_count,popularity + runtime,popularity + revenue,budget + revenue,popularity + vote_count,budget + vote_count,budget + runtime,revenue * vote_average,budget * vote_average,budget * revenue,budget * runtime,popularity * vote_average,popularity * vote_count,runtime * vote_average,revenue * vote_count,revenue * runtime,vote_average * vote_count,budget * popularity,...,budget + popularity * revenue,budget + popularity * runtime,popularity + vote_count * vote_count,popularity + runtime * revenue + vote_count,popularity + vote_average * vote_average,revenue + vote_average * vote_count,budget + vote_count * popularity + runtime,budget + vote_count * popularity + vote_average,popularity + vote_count * runtime + vote_count,vote_average + vote_count * vote_count,budget + popularity * popularity + runtime,budget + vote_average * revenue,budget + runtime * revenue + vote_count,revenue + vote_average * revenue + vote_count,budget + popularity * vote_count,revenue + runtime * vote_count,budget + popularity * popularity + vote_count,revenue + vote_count * vote_count,budget + runtime * runtime + vote_count,budget * budget + revenue,budget + revenue * runtime + vote_count,budget + vote_count * revenue,popularity + revenue * runtime + vote_count,revenue + runtime * revenue + vote_average,revenue + vote_count * runtime + vote_count,budget + runtime * vote_count,budget + popularity * revenue + vote_average,budget + revenue * revenue + vote_count,popularity + vote_average * vote_average + vote_count,budget + popularity * popularity + vote_average,popularity + revenue * runtime,budget + vote_count * revenue + runtime,budget + runtime * vote_average,revenue * vote_average + vote_count,budget + revenue * runtime,budget + vote_count * popularity + vote_count,budget + runtime * popularity,budget + popularity * budget + vote_average,budget + vote_average * vote_average,budget + popularity * revenue + vote_count
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1,Unnamed: 26_level_1,Unnamed: 27_level_1,Unnamed: 28_level_1,Unnamed: 29_level_1,Unnamed: 30_level_1,Unnamed: 31_level_1,Unnamed: 32_level_1,Unnamed: 33_level_1,Unnamed: 34_level_1,Unnamed: 35_level_1,Unnamed: 36_level_1,Unnamed: 37_level_1,Unnamed: 38_level_1,Unnamed: 39_level_1,Unnamed: 40_level_1,Unnamed: 41_level_1,Unnamed: 42_level_1,Unnamed: 43_level_1,Unnamed: 44_level_1,Unnamed: 45_level_1,Unnamed: 46_level_1,Unnamed: 47_level_1,Unnamed: 48_level_1,Unnamed: 49_level_1,Unnamed: 50_level_1,Unnamed: 51_level_1,Unnamed: 52_level_1,Unnamed: 53_level_1,Unnamed: 54_level_1,Unnamed: 55_level_1,Unnamed: 56_level_1,Unnamed: 57_level_1,Unnamed: 58_level_1,Unnamed: 59_level_1,Unnamed: 60_level_1,Unnamed: 61_level_1,Unnamed: 62_level_1,Unnamed: 63_level_1,Unnamed: 64_level_1,Unnamed: 65_level_1,Unnamed: 66_level_1,Unnamed: 67_level_1,Unnamed: 68_level_1,Unnamed: 69_level_1,Unnamed: 70_level_1,Unnamed: 71_level_1,Unnamed: 72_level_1,Unnamed: 73_level_1,Unnamed: 74_level_1,Unnamed: 75_level_1,Unnamed: 76_level_1,Unnamed: 77_level_1,Unnamed: 78_level_1,Unnamed: 79_level_1,Unnamed: 80_level_1,Unnamed: 81_level_1
2,-0.174232,-0.656286,0.770107,-0.134109,-0.24245,0.156412,3,26,4,3036,0,1988.0,10.0,21.0,-0.830517,0.635998,0.595876,0.527657,0.113822,0.926519,-0.086038,-0.30834,-0.790394,-0.499874,-0.01782,-0.416681,0.022303,-0.376559,-0.898735,-0.134177,-0.186712,0.042242,0.159116,0.120454,-0.020976,-0.50541,0.023366,0.114346,-0.103278,-0.037922,...,0.014991,0.056466,-0.002991,0.154131,0.713519,-0.079912,0.188232,-0.348889,-0.017628,-0.085293,0.043008,-0.091935,0.277116,-0.183733,0.011538,0.11138,-0.001919,0.041351,0.710355,0.101024,0.329343,0.065608,0.014085,-0.494885,0.243711,0.120528,-0.051268,0.12848,0.589264,-0.079716,0.011695,0.312738,-0.692123,-0.110811,0.273462,-0.008398,-0.140573,-0.045399,0.406353,0.026529
3,-0.174232,-0.4735,0.770107,-0.152429,-0.24245,-0.104768,3,26,4,27311,0,1986.0,10.0,16.0,-0.647731,0.617679,0.595876,0.527657,0.296607,0.665339,-0.347218,-0.32666,-0.625928,-0.578268,-0.279,-0.416681,-0.257197,-0.394878,-0.71595,-0.134177,-0.186712,0.042242,0.1148,-0.080683,0.01597,-0.364646,0.026558,0.082499,-0.117386,0.025401,...,0.060496,0.164408,0.039204,0.188897,0.512382,-0.090828,0.228346,-0.262728,0.160987,-0.094152,0.200785,-0.091935,0.233872,-0.194649,0.052926,0.098733,0.089303,0.049792,0.448133,0.101024,0.260813,0.0688,0.174634,-0.385967,0.204466,0.109131,-0.206899,0.136113,0.410966,-0.231018,0.132106,0.255775,-0.551358,-0.107619,0.197299,0.101561,0.075009,-0.183212,0.406353,0.113422
5,-0.107386,0.10097,0.458269,0.873476,-0.012873,1.016712,3,20,4,11966,0,1995.0,12.0,9.0,-0.006416,1.331744,0.350883,0.445395,0.559238,1.474981,1.003839,0.76609,0.974446,1.117682,0.909326,-0.12026,1.890188,0.860602,0.088096,-0.049212,-0.0059,0.001382,-0.0013,0.465927,0.888074,0.046271,-0.093799,-0.010843,0.400287,-0.013089,...,-0.107798,0.101357,1.651033,0.856245,0.675938,0.306487,0.96188,1.269372,1.841885,1.163247,1.121973,-0.047829,0.06749,0.268808,0.876829,-0.005604,1.897444,0.669161,0.085845,0.001548,-0.117186,-0.092417,0.886089,-0.002251,0.746513,0.07695,0.35223,-0.09213,1.964298,1.480643,0.091814,-0.005522,0.040372,-0.143011,-0.012143,1.6267,0.089569,0.447105,0.204111,0.769031
6,0.014443,0.414317,0.406296,-0.062865,-0.24245,0.435876,3,20,4,16770,0,1993.0,10.0,15.0,0.42876,0.34343,0.420738,0.163846,0.820612,0.842172,0.193426,-0.048423,0.351451,0.850193,0.450319,-0.228007,0.37301,-0.305315,0.171867,0.005868,-0.098506,-0.003502,-0.100451,0.177094,-0.027402,0.168335,-0.000908,0.005984,-0.025542,-0.105678,...,0.002794,0.08014,-0.023449,-0.041169,0.342171,-0.02645,-0.259577,-0.257128,0.131095,-0.02159,0.164449,0.002366,-0.008322,-0.020373,-0.01216,-0.026954,0.07215,0.003044,0.060403,0.05528,-0.080133,-0.00441,0.158265,0.180396,-0.017018,-0.010804,0.081382,0.011041,0.289227,0.162898,0.186575,-0.130907,0.069829,0.00496,-0.094467,-0.113886,0.074913,0.031692,0.06657,-0.009366
11,11.879695,0.701552,1.289838,13.573113,0.388885,6.532646,3,20,4,28841,0,1977.0,5.0,25.0,12.581246,14.862951,13.169533,1.678723,1.99139,7.822484,6.921531,25.452807,14.274664,7.234197,18.41234,12.26858,20.105758,13.961998,1.090437,15.322882,0.501599,4.619836,0.272823,8.426055,88.668337,0.904888,161.244432,8.334218,17.507117,2.540448,...,82.225673,4.855811,272.897723,184.130631,10.089737,178.751549,101.003847,109.217501,287.002949,201.736503,50.071721,19.942718,27.754674,335.201574,93.946718,170.76667,139.162627,345.473818,15.565617,4.771067,175.129854,165.864268,262.829975,165.689131,363.330275,14.800619,91.153326,312.269791,116.265192,54.143564,12.917206,175.659329,1.406487,176.567314,8.607041,280.716552,7.123436,11.619334,2.165281,176.17239


In [38]:
feature_defs

[<Feature: revenue>,
 <Feature: runtime>,
 <Feature: vote_average>,
 <Feature: vote_count>,
 <Feature: budget>,
 <Feature: popularity>,
 <Feature: adult>,
 <Feature: original_language>,
 <Feature: status>,
 <Feature: title>,
 <Feature: video>,
 <Feature: year>,
 <Feature: month>,
 <Feature: day>,
 <Feature: revenue + runtime>,
 <Feature: vote_average + vote_count>,
 <Feature: revenue + vote_average>,
 <Feature: budget + vote_average>,
 <Feature: runtime + vote_average>,
 <Feature: popularity + vote_average>,
 <Feature: budget + popularity>,
 <Feature: revenue + vote_count>,
 <Feature: runtime + vote_count>,
 <Feature: popularity + runtime>,
 <Feature: popularity + revenue>,
 <Feature: budget + revenue>,
 <Feature: popularity + vote_count>,
 <Feature: budget + vote_count>,
 <Feature: budget + runtime>,
 <Feature: revenue * vote_average>,
 <Feature: budget * vote_average>,
 <Feature: budget * revenue>,
 <Feature: budget * runtime>,
 <Feature: popularity * vote_average>,
 <Feature: popula

In [39]:
ratings.head()

Unnamed: 0,userId,movieId,rating,timestamp
0,1,110,1.0,1425941529
1,1,147,4.5,1425942435
2,1,858,5.0,1425941523
3,1,1221,5.0,1425941546
4,1,1246,5.0,1425941556


In [40]:
ratings.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 26024289 entries, 0 to 26024288
Data columns (total 4 columns):
userId       int64
movieId      int64
rating       float64
timestamp    int64
dtypes: float64(1), int64(3)
memory usage: 794.2 MB


In [53]:
es.entity_from_dataframe(entity_id = 'ratings_entity_id', dataframe = ratings, 
                         make_index = True, index = 'rating_id',
                         variable_types = {'userId': ft.variable_types.Categorical, 'timestamp': ft.variable_types.Categorical})

Entityset: movies_entitySet
  Entities:
    movies_entity_id [Rows: 45434, Columns: 15]
    ratings_entity_id [Rows: 26024289, Columns: 5]
  Relationships:
    ratings_entity_id.movieId -> movies_entity_id.movieId

In [54]:
# add relationships
movies_ratings_relation = ft.Relationship(es['movies_entity_id']['movieId'],
                                          es['ratings_entity_id']['movieId'])

es = es.add_relationship(movies_ratings_relation)



In [55]:
es

Entityset: movies_entitySet
  Entities:
    movies_entity_id [Rows: 45434, Columns: 15]
    ratings_entity_id [Rows: 26024289, Columns: 5]
  Relationships:
    ratings_entity_id.movieId -> movies_entity_id.movieId

In [56]:
es['ratings_entity_id']

Entity: ratings_entity_id
  Variables:
    rating_id (dtype: index)
    movieId (dtype: ordinal)
    rating (dtype: numeric)
    userId (dtype: categorical)
    timestamp (dtype: categorical)
  Shape:
    (Rows: 26024289, Columns: 5)

In [0]:
df, features = ft.dfs(entityset = es, target_entity = 'movies_entity_id', max_depth = 2)

# df, features = ft.dfs(entityset = es, target_entity = 'movies_entity_id', agg_primitives = ['mean'])

In [58]:
df.head(5)

Unnamed: 0_level_0,revenue,runtime,vote_average,vote_count,budget,popularity,adult,original_language,status,title,video,year,month,day,SUM(ratings_entity_id.rating),STD(ratings_entity_id.rating),MAX(ratings_entity_id.rating),SKEW(ratings_entity_id.rating),MIN(ratings_entity_id.rating),MEAN(ratings_entity_id.rating),COUNT(ratings_entity_id),NUM_UNIQUE(ratings_entity_id.userId),NUM_UNIQUE(ratings_entity_id.timestamp),MODE(ratings_entity_id.userId),MODE(ratings_entity_id.timestamp)
movieId,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1,Unnamed: 22_level_1,Unnamed: 23_level_1,Unnamed: 24_level_1,Unnamed: 25_level_1
2,-0.174232,-0.656286,0.770107,-0.134109,-0.24245,0.156412,3,26,4,3036,0,1988.0,10.0,21.0,84355.0,0.958437,5.0,-0.318964,0.5,3.236953,26060.0,26060.0,26058.0,24.0,849122404.0
3,-0.174232,-0.4735,0.770107,-0.152429,-0.24245,-0.104768,3,26,4,27311,0,1986.0,10.0,16.0,49211.5,1.013307,5.0,-0.241727,0.5,3.17555,15497.0,15497.0,15495.0,46.0,860156485.0
5,-0.107386,0.10097,0.458269,0.873476,-0.012873,1.016712,3,20,4,11966,0,1995.0,12.0,9.0,46988.0,0.993108,5.0,-0.243825,0.5,3.079565,15258.0,15258.0,15257.0,2.0,848992200.0
6,0.014443,0.414317,0.406296,-0.062865,-0.24245,0.435876,3,20,4,16770,0,1993.0,10.0,15.0,107166.0,0.883211,5.0,-0.589678,0.5,3.841764,27895.0,27895.0,27893.0,15.0,844965565.0
11,11.879695,0.701552,1.289838,13.573113,0.388885,6.532646,3,20,4,28841,0,1977.0,5.0,25.0,71290.0,0.906568,5.0,-0.502945,0.5,3.660591,19475.0,19475.0,19470.0,6.0,822873600.0


In [59]:
features

[<Feature: revenue>,
 <Feature: runtime>,
 <Feature: vote_average>,
 <Feature: vote_count>,
 <Feature: budget>,
 <Feature: popularity>,
 <Feature: adult>,
 <Feature: original_language>,
 <Feature: status>,
 <Feature: title>,
 <Feature: video>,
 <Feature: year>,
 <Feature: month>,
 <Feature: day>,
 <Feature: SUM(ratings_entity_id.rating)>,
 <Feature: STD(ratings_entity_id.rating)>,
 <Feature: MAX(ratings_entity_id.rating)>,
 <Feature: SKEW(ratings_entity_id.rating)>,
 <Feature: MIN(ratings_entity_id.rating)>,
 <Feature: MEAN(ratings_entity_id.rating)>,
 <Feature: COUNT(ratings_entity_id)>,
 <Feature: NUM_UNIQUE(ratings_entity_id.userId)>,
 <Feature: NUM_UNIQUE(ratings_entity_id.timestamp)>,
 <Feature: MODE(ratings_entity_id.userId)>,
 <Feature: MODE(ratings_entity_id.timestamp)>]