In [2]:
%matplotlib inline
import numpy as np
import pandas as pd
import matplotlib.pyplot as pyplot

In [3]:
from sklearn.model_selection import train_test_split

x = pd.read_csv('vol/intermediate_result/x.csv')
y = x['worldwide_gross']
x = x.drop('worldwide_gross',axis=1)
x_train, x_test, y_train, y_test = train_test_split(x,y)


In [4]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(x_train)

StandardScaler(copy=True, with_mean=True, with_std=True)

In [5]:
scaler.mean_

array([3.30057883e+07, 2.00211374e+03, 2.12580834e+00, 1.08650573e+02,
       4.03947428e+07, 6.46611436e+00, 4.74184257e+07])

In [6]:
scaler.transform(x_train)

array([[-0.67648015, -2.42680087,  0.31245916, ..., -0.15293064,
         1.06714314,  0.01443147],
       [ 0.66389838, -0.4262602 ,  0.31245916, ...,  0.15166193,
        -0.15633658,  0.00528916],
       [ 3.61519056,  0.49065427,  0.31245916, ...,  0.61183777,
         1.82005373,  2.82553517],
       ...,
       [-0.61499489,  0.32394255,  0.31245916, ..., -0.14197407,
         0.50246019,  0.01443147],
       [-0.61499489,  0.99078944,  0.31245916, ..., -0.14197407,
        -0.34456423, -0.34231168],
       [-0.80068036,  0.32394255, -0.38439807, ..., -0.17484378,
        -0.2504504 , -0.75921883]])

In [7]:
x_train_scaled, x_test_scaled = (scaler.transform(x_train),scaler.transform(x_test))

In [8]:
from sklearn.linear_model import Lasso

model = Lasso()
model_scaled = Lasso()

model.fit(x_train,y_train)
model_scaled.fit(x_train_scaled,y_train)

Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [9]:
print(model.score(x_test,y_test))
print(model_scaled.score(x_test_scaled,y_test))

0.8371017974170494
0.8371017985597671


# Pipelines

In [10]:
from sklearn.pipeline import make_pipeline
model_scaled = make_pipeline(StandardScaler(),Lasso())

model_scaled.fit(x_train, y_train)

Pipeline(memory=None,
         steps=[('standardscaler',
                 StandardScaler(copy=True, with_mean=True, with_std=True)),
                ('lasso',
                 Lasso(alpha=1.0, copy_X=True, fit_intercept=True,
                       max_iter=1000, normalize=False, positive=False,
                       precompute=False, random_state=None, selection='cyclic',
                       tol=0.0001, warm_start=False))],
         verbose=False)

In [11]:
print(model_scaled.score(x_test,y_test))

0.8371017985597671


# Features automaticamente

In [12]:
A = np.arange(6).reshape(3,2)
A

array([[0, 1],
       [2, 3],
       [4, 5]])

In [13]:
from sklearn.preprocessing import PolynomialFeatures

transformer = PolynomialFeatures(2)
transformer.fit(A)
transformer.transform(A)


array([[ 1.,  0.,  1.,  0.,  0.,  1.],
       [ 1.,  2.,  3.,  4.,  6.,  9.],
       [ 1.,  4.,  5., 16., 20., 25.]])

In [14]:
x.shape

(4104, 7)

In [15]:
transformer = PolynomialFeatures(2)
transformer.fit_transform(x).shape

(4104, 36)

In [16]:
model_poly = make_pipeline(PolynomialFeatures(2), Lasso())

model_poly.fit(x_train, y_train)
model_poly.score(x_test, y_test)

0.8853928390720787

In [17]:
x.shape

(4104, 7)

In [18]:
transformer = PolynomialFeatures(2)
transformer.fit_transform(x).shape

(4104, 36)

In [19]:
model_poly = make_pipeline(PolynomialFeatures(2), Lasso())

model_poly.fit(x_train, y_train)
model_poly.score(x_test, y_test)

0.8853928390720787

In [22]:
d = pd.DataFrame([['Chile','Colombia', 'Venezuela'],['hombre','mujer','hombre','mujer']])
d = d.T
d.columns = pd.Index(['pais','genero'])
d

Unnamed: 0,pais,genero
0,Chile,hombre
1,Colombia,mujer
2,Venezuela,hombre
3,,mujer


In [23]:
pd.get_dummies(d)

Unnamed: 0,pais_Chile,pais_Colombia,pais_Venezuela,genero_hombre,genero_mujer
0,1,0,0,1,0
1,0,1,0,0,1
2,0,0,1,1,0
3,0,0,0,0,1


In [24]:
movies_obj = pd.read_csv('vol/intermediate_result/movies_obj.csv')

In [27]:
movies_obj.apply(pd.Series.nunique).sort_values()

color                2
content_rating      18
language            47
country             65
genres             914
actor_1_name      2097
director_name     2398
actor_2_name      3032
actor_3_name      3521
plot_keywords     4760
movie_title       4917
dtype: int64

In [29]:
!pip install category_encoders

Collecting category_encoders
  Downloading https://files.pythonhosted.org/packages/a0/52/c54191ad3782de633ea3d6ee3bb2837bda0cf3bc97644bb6375cf14150a0/category_encoders-2.1.0-py2.py3-none-any.whl (100kB)
You should consider upgrading via the 'python -m pip install --upgrade pip' command.
Collecting statsmodels>=0.6.1 (from category_encoders)
  Downloading https://files.pythonhosted.org/packages/21/8e/198d8d276cf8ab753679ab3db558675355e0dd286b8243f1c8a67d4f99db/statsmodels-0.11.1-cp38-none-win32.whl (7.8MB)
Collecting patsy>=0.4.1 (from category_encoders)
  Downloading https://files.pythonhosted.org/packages/ea/0c/5f61f1a3d4385d6bf83b83ea495068857ff8dfb89e74824c6e9eb63286d8/patsy-0.5.1-py2.py3-none-any.whl (231kB)
Installing collected packages: patsy, statsmodels, category-encoders
Successfully installed category-encoders-2.1.0 patsy-0.5.1 statsmodels-0.11.1


In [30]:
categoricals = pd.read_csv('vol/intermediate_result/categoricals.csv').set_index('Unnamed: 0')

In [31]:
categoricals.head(2)

Unnamed: 0_level_0,actor_1_name,director_name
Unnamed: 0,Unnamed: 1_level_1,Unnamed: 2_level_1
0,CCH Pounder,James Cameron
1,Doug Walker,Doug Walker


In [33]:
categoricals = categoricals.reset_index(drop=True).fillna(0)

In [44]:
x_binenc = pd.concat([x,categoricals],axis=1)

In [45]:
x_binenc

Unnamed: 0,production_budget,title_year,aspect_ratio,duration.1,budget,imdb_score,gross,actor_1_name,director_name
0,425000000.0,2009.000000,1.780000,178.000000,2.370000e+08,7.9,7.605058e+08,CCH Pounder,James Cameron
1,306000000.0,2002.130733,2.126976,108.577186,4.045539e+07,7.1,4.831933e+07,Doug Walker,Doug Walker
2,300000000.0,2007.000000,2.350000,169.000000,3.000000e+08,7.1,3.094042e+08,Johnny Depp,Gore Verbinski
3,300000000.0,2015.000000,2.350000,148.000000,2.450000e+08,6.8,2.000742e+08,Christoph Waltz,Sam Mendes
4,275000000.0,2012.000000,2.350000,164.000000,2.500000e+08,8.5,4.481306e+08,Tom Hardy,Christopher Nolan
...,...,...,...,...,...,...,...,...,...
4099,7000.0,2004.000000,1.850000,77.000000,7.000000e+03,7.0,4.247600e+05,Shane Carruth,Shane Carruth
4100,7000.0,2005.000000,2.126976,80.000000,7.000000e+03,6.3,7.007100e+04,Ian Gamazon,Neill Dela Llana
4101,7000.0,2005.000000,2.126976,84.000000,3.250000e+03,7.8,4.831933e+07,Richard Jewell,Anthony Vallone
4102,3967.0,2012.000000,2.350000,100.000000,4.045539e+07,6.3,1.044300e+04,Alan Ruck,Daniel Hsia


In [36]:
import category_encoders as ce
encoder = ce.BinaryEncoder(cols=['actor_1_name','director_name'])

In [47]:
 x_binenc = encoder.fit_transform(x_binenc)

In [48]:
xb_train, xb_test, y_train, y_test = train_test_split(x_binenc,y)

In [49]:
x_train, x_test =(xb_train[x.columns],xb_test[x.columns])

In [50]:
model_binenc = Lasso()
model = Lasso()

In [51]:
model_binenc.fit(xb_train,y_train)
model.fit(x_train,y_train)

Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [52]:
print(model_binenc.score(xb_test,y_test))
print(model.score(x_test,y_test))

0.8538586709161654
0.8544291594357292


# Conocimiento experto