### Feature Selection

In this section we will continute data preprocessing steps in order to facilitate feature selection critera

In [168]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn import (
    linear_model, metrics, neural_network, pipeline, model_selection
)
from sklearn.impute import SimpleImputer

from sklearn.decomposition import PCA

In [113]:
# read in full netflix dataset (2019 - 2022)
netflix_df = pd.read_csv("netflix_full_data.csv").reset_index(drop=True)

In [114]:
# Change "date" column to ensure datetime type and drop weekends
netflix_df["date"] = pd.to_datetime(netflix_df["date"])

In [129]:
from pandas.tseries.holiday import USFederalHolidayCalendar
from pandas.tseries.offsets import CustomBusinessDay

# create dateframe 2019-01-01 to 2022-03-31 which excludes weekends and holidays
us_bd = CustomBusinessDay(calendar=USFederalHolidayCalendar())
dates = pd.date_range(start='2019-01-01', end='2022-03-31', freq=us_bd)
dates = pd.DataFrame(dates)
dates = dates.rename(columns={0: "date"})


In [149]:
# Merge on new date range to remove weekends and holidays in netflix dataset
netflix_2 = dates.merge(netflix_df, on="date", how="left")
netflix_2.head()

Unnamed: 0,date,Netflix. Inc,Netflix_x,Netflix Stock,Streaming media,Reed Hastings_x,Open,High,Low,Close,...,Dow_MAvg_s_Move,Dow_EMA_Move,Dow_Disparity_Move,Dow_Disparity_s_Move,Dow_RSI_Move,target_1,target_2,target_3,target_4,target_5
0,2019-01-02,0.0,26.375,0.5,0.0,0.0,259.279999,269.75,256.579987,267.660004,...,0,0,0,0,0,1,1,1,1,1
1,2019-01-03,0.0,32.541667,0.333333,0.0,0.0,270.200012,275.790009,264.429993,271.200012,...,0,0,0,0,0,1,1,1,1,1
2,2019-01-04,0.0,29.708333,0.0,0.0,0.375,281.880005,297.799988,278.540009,297.570007,...,1,0,1,1,0,0,0,0,0,0
3,2019-01-07,0.0,21.5,0.291667,0.0,0.0,302.100006,316.799988,301.649994,315.339996,...,1,0,0,0,0,1,1,1,1,0
4,2019-01-08,0.0,25.0,1.0,0.0,0.0,319.980011,320.589996,308.01001,320.269989,...,1,1,1,1,0,0,0,0,0,0


In [177]:
# Create X feature dataset
X = netflix_2.drop(["date", "target_1", "target_2", "target_3", "target_4", "target_5"], axis=1)

# Create y target dataset
y = netflix_2["target_1"]

In [178]:
X

Unnamed: 0,Netflix. Inc,Netflix_x,Netflix Stock,Streaming media,Reed Hastings_x,Open,High,Low,Close,Volume,...,Dow_avg_loss,Dow_rs,Dow_RSI,Dow_Move,Dow_MAvg_Move,Dow_MAvg_s_Move,Dow_EMA_Move,Dow_Disparity_Move,Dow_Disparity_s_Move,Dow_RSI_Move
0,0.0,26.375000,0.500000,0.0,0.000000,259.279999,269.750000,256.579987,267.660004,11679500.0,...,,,,0,0,0,0,0,0,0
1,0.0,32.541667,0.333333,0.0,0.000000,270.200012,275.790009,264.429993,271.200012,14969600.0,...,,,,0,0,0,0,0,0,0
2,0.0,29.708333,0.000000,0.0,0.375000,281.880005,297.799988,278.540009,297.570007,19330100.0,...,,,,1,1,1,0,1,1,0
3,0.0,21.500000,0.291667,0.0,0.000000,302.100006,316.799988,301.649994,315.339996,18620100.0,...,,,,0,1,1,0,0,0,0
4,0.0,25.000000,1.000000,0.0,0.000000,319.980011,320.589996,308.010010,320.269989,15359200.0,...,,,,1,1,1,1,1,1,0
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
810,0.0,19.208333,0.541667,0.0,0.000000,377.070007,377.640015,366.429993,373.850006,3574500.0,...,,,,1,1,1,1,1,1,0
811,0.0,19.833333,0.416667,0.0,0.000000,375.230011,380.279999,366.730011,378.510010,4323400.0,...,,,,0,1,1,0,0,0,0
812,0.0,18.500000,0.000000,0.0,0.416667,384.390015,396.500000,380.329987,391.820007,5880700.0,...,,,,1,1,1,1,1,1,0
813,0.0,24.875000,0.416667,0.0,0.000000,389.549988,392.700012,378.630005,381.470001,4023300.0,...,,,,0,1,1,1,0,0,0


In [138]:
# Create lasso model
lasso_model = linear_model.Lasso()
lasso_model.fit(X, y)

lasso_coefs = lasso_model.coef_
lasso_coefs

ValueError: Input contains NaN, infinity or a value too large for dtype('float64').