<a href="https://colab.research.google.com/github/czokw1/ml-bootcamp/blob/main/supervised/01_basics/03_feature_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Import libraries

In [1]:
import pandas as pd
import numpy as np

import sklearn

# Get data

In [6]:
def fetch_financial_data(company='AMZN'):
    """Download the available quote history for one ticker from Stooq.

    Parameters
    ----------
    company : str, default 'AMZN'
        Ticker symbol, forwarded to ``DataReader`` as ``name``.

    Returns
    -------
    Whatever ``pandas_datareader.data.DataReader`` returns for the
    ``'stooq'`` data source (shown in this notebook as a date-indexed
    frame with Open/High/Low/Close/Volume columns).
    """
    # The import is local to the function, so pandas_datareader is only
    # loaded when data is actually requested.
    from pandas_datareader import data as stooq_source

    quotes = stooq_source.DataReader(name=company, data_source='stooq')
    return quotes

# Fetch the history for the default ticker and keep it as the untouched raw frame.
df_raw = fetch_financial_data(company='AMZN')

# Preview the first five rows.
df_raw.head(n=5)

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2022-03-25,3280.0,3307.37,3245.0,3295.47,2454275
2022-03-24,3274.99,3282.37,3201.0,3272.99,2839903
2022-03-23,3274.1,3327.4,3253.74,3268.16,2790593
2022-03-22,3236.11,3323.34,3233.98,3297.78,3204306
2022-03-21,3222.415,3261.68,3191.06,3229.83,3326876


# Copy raw data

In [8]:
# Work on an independent five-row sample so that df_raw is never modified.
# The copy is taken *after* slicing: a bare slice (df[:5]) is tracked by pandas
# as a possible view of its parent, and adding columns to it later raises
# SettingWithCopyWarning.
df = df_raw.iloc[:5].copy()
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5 entries, 2022-03-25 to 2022-03-21
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Open    5 non-null      float64
 1   High    5 non-null      float64
 2   Low     5 non-null      float64
 3   Close   5 non-null      float64
 4   Volume  5 non-null      int64  
dtypes: float64(4), int64(1)
memory usage: 240.0 bytes


# Creating new variables

In [9]:
# Derive calendar features from the DatetimeIndex.
# assign() returns a new frame rather than writing into the existing one, so
# this cell does not trip pandas' chained-assignment check when df originates
# from a slice, and re-running it always yields the same result.
df = df.assign(
    day=df.index.day,
    month=df.index.month,
    year=df.index.year,
)
df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,day,month,year
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2022-03-25,3280.0,3307.37,3245.0,3295.47,2454275,25,3,2022
2022-03-24,3274.99,3282.37,3201.0,3272.99,2839903,24,3,2022
2022-03-23,3274.1,3327.4,3253.74,3268.16,2790593,23,3,2022
2022-03-22,3236.11,3323.34,3233.98,3297.78,3204306,22,3,2022
2022-03-21,3222.415,3261.68,3191.06,3229.83,3326876,21,3,2022


# Discretizing continuous variables

In [10]:
# Toy sample of heights (floats) used by the binning examples that follow.
df = pd.DataFrame(
    data=[175., 178.5, 185., 184.5, 183., 168.],
    columns=['height'],
)
df

Unnamed: 0,height
0,175.0
1,178.5
2,185.0
3,184.5
4,183.0
5,168.0


In [11]:
# Bucket the heights into three equal-width intervals; with an integer
# `bins`, pandas works out the edges from the range of the data.
df['height_cat'] = pd.cut(df['height'], bins=3)
df

Unnamed: 0,height,height_cat
0,175.0,"(173.667, 179.333]"
1,178.5,"(173.667, 179.333]"
2,185.0,"(179.333, 185.0]"
3,184.5,"(179.333, 185.0]"
4,183.0,"(179.333, 185.0]"
5,168.0,"(167.983, 173.667]"


In [12]:
# Same bucketing, but with hand-picked edges. Intervals are closed on the
# right, so a value equal to an edge (e.g. 175.0) lands in the lower bin.
df['height_cat'] = pd.cut(df['height'], bins=[160, 175, 180, 195])
df

Unnamed: 0,height,height_cat
0,175.0,"(160, 175]"
1,178.5,"(175, 180]"
2,185.0,"(180, 195]"
3,184.5,"(180, 195]"
4,183.0,"(180, 195]"
5,168.0,"(160, 175]"


In [14]:
# Explicit edges again, now with one human-readable label per bin instead of
# the interval notation.
# NOTE(review): 'heigh' looks like a typo for 'high'; it is kept unchanged
# because the dummy-column names generated further down derive from it.
df['height_cat'] = pd.cut(
    df['height'],
    bins=[160, 175, 180, 195],
    labels=['small', 'medium', 'heigh'],
)
df

Unnamed: 0,height,height_cat
0,175.0,small
1,178.5,medium
2,185.0,heigh
3,184.5,heigh
4,183.0,heigh
5,168.0,small


In [17]:
# One-hot encode the categorical column. drop_first=True omits the indicator
# for the first category, so that category is represented by all-zero rows.
pd.get_dummies(data=df, prefix='height', drop_first=True)

Unnamed: 0,height,height_medium,height_heigh
0,175.0,0,0
1,178.5,1,0
2,185.0,0,1
3,184.5,0,1
4,183.0,0,1
5,168.0,0,0


# Feature extraction

In [18]:
# Each row stores a list of language codes.
df = pd.DataFrame({
    'lang': [
        ['PL', 'ENG'],
        ['GER', 'ENG', 'PL', 'FRA'],
        ['RUS'],
    ],
})
df

Unnamed: 0,lang
0,"[PL, ENG]"
1,"[GER, ENG, PL, FRA]"
2,[RUS]


In [21]:
# Number of languages listed in each row.
df['lang_number'] = df['lang'].map(len)
df

Unnamed: 0,lang,lang_number
0,"[PL, ENG]",2
1,"[GER, ENG, PL, FRA]",4
2,[RUS],1


In [22]:
# Binary indicator: 1 when 'PL' is among the row's languages, otherwise 0.
df['PL_flag'] = df['lang'].map(lambda langs: int('PL' in langs))
df

Unnamed: 0,lang,lang_number,PL_flag
0,"[PL, ENG]",2,1
1,"[GER, ENG, PL, FRA]",4,1
2,[RUS],1,0


In [23]:
# Sample of website addresses used by the string-splitting examples.
df = pd.DataFrame(
    data=['wp.pl', 'onet.pl', 'google.com'],
    columns=['website'],
)
df

Unnamed: 0,website
0,wp.pl
1,onet.pl
2,google.com


In [25]:
# Split every address on the dot; expand=True spreads the pieces over
# separate, integer-labelled columns instead of returning lists.
df['website'].str.split(pat='.', expand=True)

Unnamed: 0,0,1
0,wp,pl
1,onet,pl
2,google,com


In [29]:
# Split once, then attach the two pieces under descriptive column names:
# piece 0 is the part before the dot, piece 1 the part after it.
new = df['website'].str.split(pat='.', expand=True)
df = df.assign(portal=new[0], extension=new[1])
df

Unnamed: 0,website,portal,extension
0,wp.pl,wp,pl
1,onet.pl,onet,pl
2,google.com,google,com
