In [1]:
import numpy as np
import pandas as pd
from matplotlib import pyplot as plt
from sklearn.model_selection import train_test_split
%matplotlib inline

In [29]:
X = pd.read_csv('./intermediate_results/X.csv')
y = X['worldwide_gross']
X = X.drop('worldwide_gross',axis=1).drop('Unnamed: 0', axis=1)

X_train,X_test,y_train,y_test=train_test_split(X,y, random_state= 1)

# Scaling data

In [30]:
from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaler.fit(X_train) 

StandardScaler(copy=True, with_mean=True, with_std=True)

In [31]:
scaler.mean_

array([3.31409009e+07, 2.00221527e+03, 2.12904874e+00, 1.08643613e+02,
       1.01557141e+04, 3.78170542e+07, 6.44372969e+00])

In [32]:
scaler.scale_

array([4.06860372e+07, 1.20009435e+01, 7.61475103e-01, 2.29961880e+01,
       1.55561461e+04, 7.57783208e+07, 1.06889904e+00])

In [33]:
X.values

array([[4.25000000e+08, 2.00900000e+03, 1.78000000e+00, ...,
        4.83400000e+03, 2.37000000e+08, 7.90000000e+00],
       [3.06000000e+08, 2.00213073e+03, 2.12697615e+00, ...,
        1.43000000e+02, 4.04553863e+07, 7.10000000e+00],
       [3.00000000e+08, 2.00700000e+03, 2.35000000e+00, ...,
        4.83500000e+04, 3.00000000e+08, 7.10000000e+00],
       ...,
       [7.00000000e+03, 2.00500000e+03, 2.12697615e+00, ...,
        9.30000000e+01, 3.25000000e+03, 7.80000000e+00],
       [3.96700000e+03, 2.01200000e+03, 2.35000000e+00, ...,
        2.38600000e+03, 4.04553863e+07, 6.30000000e+00],
       [1.10000000e+03, 2.00400000e+03, 1.85000000e+00, ...,
        1.63000000e+02, 1.10000000e+03, 6.60000000e+00]])

In [34]:
scaler.transform(X_train)

array([[-0.61792454, -0.00704453, -0.0027218 , ...,  1.22397191,
         0.03481645,  1.1752937 ],
       [-0.51961071,  0.89865648,  0.29016216, ..., -0.40869468,
        -0.34069182,  1.26884791],
       [ 1.97510263,  0.48202257,  0.29016216, ..., -0.54941076,
         0.53026968, -1.25711564],
       ...,
       [ 0.41437064,  0.56534935, -0.36645813, ...,  1.2708987 ,
         0.16077086, -0.6022362 ],
       [-0.80594973,  0.23204222, -0.36645813, ..., -0.6518783 ,
        -0.475952  ,  1.26884791],
       [ 1.88907803,  0.89865648, -0.36645813, ..., -0.59408764,
         0.88657211, -0.97645302]])

In [35]:
X_train_scaled,X_test_scaled = (scaler.transform(X_train),scaler.transform(X_test))

In [36]:
from sklearn.linear_model import Lasso
model = Lasso()
model_scaled = Lasso()

model.fit(X_train,y_train)
model_scaled.fit(X_train_scaled,y_train)

Lasso(alpha=1.0, copy_X=True, fit_intercept=True, max_iter=1000,
      normalize=False, positive=False, precompute=False, random_state=None,
      selection='cyclic', tol=0.0001, warm_start=False)

In [37]:
print(model.score(X_test,y_test))
print(model_scaled.score(X_test_scaled,y_test))
# The models are not different, so it can still be work without scaling

0.5731124344715528
0.5731124378760191


# Simplifying operations with pipeline

In [38]:
from sklearn.pipeline import make_pipeline

model_scaled = make_pipeline(StandardScaler(),Lasso())

model_scaled.fit(X_train,y_train)
model.score(X_test,y_test) 
#making the same result but with pipeline

0.5731124344715528

# Creating new features with Polynomial features

In [39]:
from sklearn.preprocessing import PolynomialFeatures

transformer = PolynomialFeatures(2)

In [40]:
transformer = PolynomialFeatures()
transformer.fit_transform(X).shape

(4104, 36)

In [41]:
model_poly = make_pipeline(PolynomialFeatures(2),Lasso())

In [42]:
model_poly.fit(X_train,y_train)

  positive)


Pipeline(memory=None,
         steps=[('polynomialfeatures',
                 PolynomialFeatures(degree=2, include_bias=True,
                                    interaction_only=False, order='C')),
                ('lasso',
                 Lasso(alpha=1.0, copy_X=True, fit_intercept=True,
                       max_iter=1000, normalize=False, positive=False,
                       precompute=False, random_state=None, selection='cyclic',
                       tol=0.0001, warm_start=False))],
         verbose=False)

In [43]:
model_poly.score(X_test,y_test)
# Polynomial features did not give a better result, it's time to change the strategy

0.45885798596581706