In [1]:
import numpy as np
import pandas as pd
import seaborn as sns

In [2]:
# Load seaborn's bundled "flights" example dataset and preview the first rows.
df = sns.load_dataset(name='flights')
df.head(n=5)

Unnamed: 0,year,month,passengers
0,1949,Jan,112
1,1949,Feb,118
2,1949,Mar,132
3,1949,Apr,129
4,1949,May,121


In [3]:
# Print the frame's structure: index range, per-column non-null counts, dtypes and memory usage.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 144 entries, 0 to 143
Data columns (total 3 columns):
 #   Column      Non-Null Count  Dtype   
---  ------      --------------  -----   
 0   year        144 non-null    int64   
 1   month       144 non-null    category
 2   passengers  144 non-null    int64   
dtypes: category(1), int64(2)
memory usage: 2.9 KB


In [4]:
# Summary statistics (count, mean, std, min, quartiles, max) for the numeric columns.
df.describe()

Unnamed: 0,year,passengers
count,144.0,144.0
mean,1954.5,280.298611
std,3.464102,119.966317
min,1949.0,104.0
25%,1951.75,180.0
50%,1954.5,265.5
75%,1957.25,360.5
max,1960.0,622.0


In [5]:
# Number of missing values in each column.
df.isna().sum()

year          0
month         0
passengers    0
dtype: int64

In [6]:
# Labels of every numeric column in the frame.
num_cols = df.select_dtypes(include=np.number).columns

In [7]:
# Display the numeric column labels selected in the previous cell.
num_cols

Index(['year', 'passengers'], dtype='object')

In [8]:
# Skewness of each numeric column (0 means a symmetric distribution).
numeric_part = df[num_cols]
numeric_part.skew()

year          0.00000
passengers    0.58316
dtype: float64

In [9]:
# Re-check the per-column missing-value counts before cleaning
# (same check as run earlier in the notebook).
df.isna().sum()

year          0
month         0
passengers    0
dtype: int64

In [10]:
# Drop columns that are mostly empty: a column is kept only when its number of
# missing values is below 60% of the row count.
threshold = 0.6 * df.shape[0]
missing_per_column = df.isnull().sum()
keep_column = missing_per_column < threshold
df = df.loc[:, keep_column]

df.head()

Unnamed: 0,year,month,passengers
0,1949,Jan,112
1,1949,Feb,118
2,1949,Mar,132
3,1949,Apr,129
4,1949,May,121


In [11]:
# Split the columns by kind so each group can be cleaned appropriately.
num_cols = df.select_dtypes(include=['number']).columns
# 'category' must be listed explicitly: the 'month' column of this dataset has
# dtype `category` (see the df.info() output), which include=['object'] alone
# does not match, leaving cat_cols empty and the categorical steps skipped.
cat_cols = df.select_dtypes(include=['object', 'category']).columns

In [12]:
# Impute missing values: numeric columns get the column mean, categorical
# columns get their most frequent value.
for col in num_cols:
    df[col] = df[col].fillna(df[col].mean())

for col in cat_cols:
    # Series.mode() returns a Series (several entries when there are ties).
    # Passing that Series to fillna aligns it by index label instead of using
    # it as a fill value, so take the first mode as a scalar.
    modes = df[col].mode()
    if not modes.empty:  # an all-missing column has no mode; leave it unchanged
        df[col] = df[col].fillna(modes.iloc[0])


In [13]:
# Handle outliers: cap every numeric column at the Tukey fences
# (1.5 * IQR below the first quartile / above the third quartile).

for col in num_cols:
    Q1 = df[col].quantile(0.25)
    Q3 = df[col].quantile(0.75)
    IQR = Q3 - Q1

    lower = Q1 - 1.5 * IQR
    upper = Q3 + 1.5 * IQR

    # Count the offending values BEFORE clipping: once the column has been
    # clipped nothing can lie outside [lower, upper], so counting afterwards
    # would always report 0 regardless of the data.
    n_below = (df[col] < lower).sum()
    n_above = (df[col] > upper).sum()

    df[col] = df[col].clip(lower, upper)

    # Report: column name, number of low outliers, number of high outliers.
    print(col, n_below, n_above)

year 0 0
passengers 0 0


In [14]:
# Feature scaling: standardise the numeric columns with StandardScaler,
# fitted on this same frame, and write the result back in place.

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
scaled_values = scaler.fit_transform(df[num_cols])
df[num_cols] = scaled_values
df[num_cols]

Unnamed: 0,year,passengers
0,-1.593255,-1.407779
1,-1.593255,-1.357590
2,-1.593255,-1.240483
3,-1.593255,-1.265578
4,-1.593255,-1.332496
...,...,...
139,1.593255,2.724417
140,1.593255,1.904669
141,1.593255,1.511525
142,1.593255,0.917627


In [15]:
# Feature engineering: 0/1 flag marking the summer months (Jun, Jul, Aug).
# The column name 'is_summer_peek' is kept as-is because later cells refer to it.
summer_months = ['Jul', 'Jun', 'Aug']
in_summer = df['month'].isin(summer_months)
df['is_summer_peek'] = in_summer.astype('int')
df['is_summer_peek']


0      0
1      0
2      0
3      0
4      0
      ..
139    1
140    0
141    0
142    0
143    0
Name: is_summer_peek, Length: 144, dtype: int64

In [16]:
# One-hot encode the categorical columns; drop_first=True omits the first
# level of each encoded column so the dummies are not perfectly collinear.

encoded = pd.get_dummies(data=df, drop_first=True)
df = encoded
df

Unnamed: 0,year,passengers,is_summer_peek,month_Feb,month_Mar,month_Apr,month_May,month_Jun,month_Jul,month_Aug,month_Sep,month_Oct,month_Nov,month_Dec
0,-1.593255,-1.407779,0,False,False,False,False,False,False,False,False,False,False,False
1,-1.593255,-1.357590,0,True,False,False,False,False,False,False,False,False,False,False
2,-1.593255,-1.240483,0,False,True,False,False,False,False,False,False,False,False,False
3,-1.593255,-1.265578,0,False,False,True,False,False,False,False,False,False,False,False
4,-1.593255,-1.332496,0,False,False,False,True,False,False,False,False,False,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
139,1.593255,2.724417,1,False,False,False,False,False,False,True,False,False,False,False
140,1.593255,1.904669,0,False,False,False,False,False,False,False,True,False,False,False
141,1.593255,1.511525,0,False,False,False,False,False,False,False,False,True,False,False
142,1.593255,0.917627,0,False,False,False,False,False,False,False,False,False,True,False


In [17]:
# Absolute correlation of every column with the 'is_summer_peek' flag,
# ranked from strongest to weakest.
corr = (
    df.corr()['is_summer_peek']
    .abs()
    .sort_values(ascending=False)
)
print(corr)

is_summer_peek    1.000000e+00
month_Jul         5.222330e-01
month_Jun         5.222330e-01
month_Aug         5.222330e-01
passengers        2.787973e-01
month_Nov         1.740777e-01
month_Sep         1.740777e-01
month_Oct         1.740777e-01
month_Dec         1.740777e-01
month_Feb         1.740777e-01
month_Mar         1.740777e-01
month_Apr         1.740777e-01
month_May         1.740777e-01
year              1.593566e-16
Name: is_summer_peek, dtype: float64


In [18]:
# Keep only the columns whose absolute correlation with the flag exceeds 0.5
# (the flag itself has correlation 1.0, so it is retained as well).
selected_feature = [name for name, strength in corr.items() if strength > 0.5]
df = df[selected_feature]
df

Unnamed: 0,is_summer_peek,month_Jul,month_Jun,month_Aug
0,0,False,False,False
1,0,False,False,False
2,0,False,False,False
3,0,False,False,False
4,0,False,False,False
...,...,...,...,...
139,1,False,False,True
140,0,False,False,False
141,0,False,False,False
142,0,False,False,False


In [20]:
from sklearn.linear_model import LassoCV

# Lasso-based feature selection: fit a 5-fold cross-validated Lasso that
# predicts the flag from the remaining columns, then keep only the features
# whose coefficient is non-zero.
X = df.drop(columns='is_summer_peek')
y = df['is_summer_peek']

lasso = LassoCV(cv=5)
lasso.fit(X, y)

importance = pd.Series(np.abs(lasso.coef_), index=X.columns)
selected_lasso = importance[importance > 0].index.tolist()

df = df[['is_summer_peek'] + selected_lasso]
df

Unnamed: 0,is_summer_peek,month_Jul,month_Jun,month_Aug
0,0,False,False,False
1,0,False,False,False
2,0,False,False,False
3,0,False,False,False
4,0,False,False,False
...,...,...,...,...
139,1,False,False,True
140,0,False,False,False
141,0,False,False,False
142,0,False,False,False
