### Feature Selection

In this section we will continute data preprocessing steps in order to facilitate feature selection critera

In [242]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn import (
    linear_model, metrics, neural_network, pipeline, model_selection
)
from sklearn.impute import SimpleImputer

from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Lasso

In [185]:
# read in full netflix dataset (2019 - 2022)
netflix = pd.read_csv("netflix_Cleaned_Data.csv")

In [189]:
# Change "date" column to ensure datetime type and drop weekends
netflix["date"] = pd.to_datetime(netflix["date"])

In [288]:
# Drop first 14 dates
df = netflix.iloc[14:-1, :].reset_index(drop = True)
df.shape

(800, 165)

In [290]:
df.replace([np.inf, -np.inf], np.nan, inplace=True)

In [297]:
# Drop remanining NaN or null values
df = df.dropna()
df.shape

(721, 165)

In [298]:
# Create X feature dataset
X = df.drop(["date", "target_1", "target_2", "target_3", "target_4", "target_5"], axis=1)

# Create y target dataset
y = df["target_1"]

In [299]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [300]:
pipeline = Pipeline([
    ('scaler', StandardScaler()),
    ('model', Lasso())
])

In [301]:
search = GridSearchCV(pipeline,
                       {'model__alpha': np.arange(0.1, 10, 0.1)},
                        cv=5,
scoring = 'neg_mean_squared_error', verbose = 3
                          )

In [302]:
search.fit(X_train, y_train)

search.best_params_

Fitting 5 folds for each of 99 candidates, totalling 495 fits
[CV 1/5] END .................model__alpha=0.1;, score=-0.252 total time=   0.0s
[CV 2/5] END .................model__alpha=0.1;, score=-0.250 total time=   0.0s
[CV 3/5] END .................model__alpha=0.1;, score=-0.243 total time=   0.0s
[CV 4/5] END .................model__alpha=0.1;, score=-0.246 total time=   0.0s
[CV 5/5] END .................model__alpha=0.1;, score=-0.245 total time=   0.0s
[CV 1/5] END .................model__alpha=0.2;, score=-0.252 total time=   0.0s
[CV 2/5] END .................model__alpha=0.2;, score=-0.250 total time=   0.0s
[CV 3/5] END .................model__alpha=0.2;, score=-0.243 total time=   0.0s
[CV 4/5] END .................model__alpha=0.2;, score=-0.246 total time=   0.0s
[CV 5/5] END .................model__alpha=0.2;, score=-0.245 total time=   0.0s
[CV 1/5] END .model__alpha=0.30000000000000004;, score=-0.252 total time=   0.0s
[CV 2/5] END .model__alpha=0.30000000000000004;

{'model__alpha': 0.1}

In [303]:
coefficients = search.best_estimator_.named_steps['model'].coef_

In [304]:
importance = np.abs(coefficients)
importance

array([0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0., 0.,
       0., 0., 0., 0., 0., 0.])