<a href="https://colab.research.google.com/github/filiptomczak/colab/blob/main/supervised/01_basic/features_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import numpy as np
import pandas as pd
import sklearn


In [5]:
def fetch_fin_data(company='AMZN'):
    """Download quote data for a single ticker from the 'stooq' data source.

    Args:
        company: Ticker symbol forwarded to the reader as ``name``.
            Defaults to ``'AMZN'``.

    Returns:
        The object returned by ``pandas_datareader.data.DataReader`` for the
        requested ticker.
    """
    # Imported inside the function, so pandas_datareader is only required
    # when data is actually fetched.
    from pandas_datareader.data import DataReader

    return DataReader(name=company, data_source='stooq')

# Fetch quotes for the default ticker and preview the first five rows.
df_raw = fetch_fin_data()
df_raw.head(5)

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2025-04-09,172.115,192.65,169.93,191.1,116804328
2025-04-08,185.23,185.9,168.57,170.66,87710360
2025-04-07,162.0,183.4099,161.38,175.26,109327115
2025-04-04,167.145,178.1436,166.0,171.0,123159359
2025-04-03,182.995,184.13,176.92,178.41,95553617


In [6]:
# Keep only the first five rows for the date-feature demo.
# Slice first and copy second: `df` is then an independent frame, not a slice
# of another frame, so adding columns to it later does not trigger pandas'
# SettingWithCopyWarning. It also avoids copying the full history just to
# keep five rows.
df = df_raw.iloc[:5].copy()
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 5 entries, 2025-04-09 to 2025-04-03
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Open    5 non-null      float64
 1   High    5 non-null      float64
 2   Low     5 non-null      float64
 3   Close   5 non-null      float64
 4   Volume  5 non-null      int64  
dtypes: float64(4), int64(1)
memory usage: 240.0 bytes


In [7]:
# Inspect the row labels; the date parts are read from this index in the next cell.
df.index

DatetimeIndex(['2025-04-09', '2025-04-08', '2025-04-07', '2025-04-04',
               '2025-04-03'],
              dtype='datetime64[ns]', name='Date', freq=None)

In [8]:
# Derive calendar features from the index.
# `assign` returns a new frame instead of writing columns into `df` in place,
# so the cell is safe to re-run and does not emit SettingWithCopyWarning when
# `df` was produced by slicing another frame.
df = df.assign(
    day=df.index.day,
    month=df.index.month,
    year=df.index.year,
)

df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,day,month,year
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2025-04-09,172.115,192.65,169.93,191.1,116804328,9,4,2025
2025-04-08,185.23,185.9,168.57,170.66,87710360,8,4,2025
2025-04-07,162.0,183.4099,161.38,175.26,109327115,7,4,2025
2025-04-04,167.145,178.1436,166.0,171.0,123159359,4,4,2025
2025-04-03,182.995,184.13,176.92,178.41,95553617,3,4,2025


In [10]:
# Small single-column frame of heights used by the binning cells that follow.
df = pd.DataFrame(
    {'height': [172., 183., 199., 185., 167., 158.5, 177.5, 186.5, 197.5]}
)
df


Unnamed: 0,height
0,172.0
1,183.0
2,199.0
3,185.0
4,167.0
5,158.5
6,177.5
7,186.5
8,197.5


In [11]:
# Discretise heights into three equal-width intervals; pandas derives the
# bin edges from the observed min/max.
df['height_cat'] = pd.cut(df['height'], bins=3)
df

Unnamed: 0,height,height_cat
0,172.0,"(158.46, 172.0]"
1,183.0,"(172.0, 185.5]"
2,199.0,"(185.5, 199.0]"
3,185.0,"(172.0, 185.5]"
4,167.0,"(158.46, 172.0]"
5,158.5,"(158.46, 172.0]"
6,177.5,"(172.0, 185.5]"
7,186.5,"(185.5, 199.0]"
8,197.5,"(185.5, 199.0]"


In [16]:
# Re-bin with explicit edges and short labels:
# (130, 175] -> 's', (175, 185] -> 'm', (185, 220] -> 'l'.
df['height_cat'] = pd.cut(
    df['height'],
    bins=(130, 175, 185, 220),
    labels=['s', 'm', 'l'],
)
df

Unnamed: 0,height,height_cat
0,172.0,s
1,183.0,m
2,199.0,l
3,185.0,m
4,167.0,s
5,158.5,s
6,177.5,m
7,186.5,l
8,197.5,l


In [18]:
# One-hot encode the categorical column. `drop_first=True` omits the first
# category, so only the height_m / height_l indicator columns are produced.
pd.get_dummies(
    data=df,
    prefix='height',
    drop_first=True,
)

Unnamed: 0,height,height_m,height_l
0,172.0,False,False
1,183.0,True,False
2,199.0,False,True
3,185.0,True,False
4,167.0,False,False
5,158.5,False,False
6,177.5,True,False
7,186.5,False,True
8,197.5,False,True


In [19]:
# Frame whose single column stores a Python list of language codes per row.
df = pd.DataFrame(
    {
        'lang': [
            ['PL', 'ENG'],
            ['GER', 'ENG', 'PL', 'FRA'],
            ['RUS'],
        ]
    }
)
df

Unnamed: 0,lang
0,"[PL, ENG]"
1,"[GER, ENG, PL, FRA]"
2,[RUS]


In [31]:
# Variant 1: Series.apply.
# lang_count = number of entries in each list; is_PL = 1 when 'PL' is listed, else 0.
df['lang_count'] = df['lang'].apply(len)
df['is_PL'] = df['lang'].apply(lambda langs: int('PL' in langs))
df

Unnamed: 0,lang,lang_count,is_PL
0,"[PL, ENG]",2,1
1,"[GER, ENG, PL, FRA]",4,1
2,[RUS],1,0


In [29]:
# Variant 2: plain list comprehensions over the column.
# Here is_PL holds booleans (True/False) rather than 1/0.
df['lang_count'] = [len(langs) for langs in df['lang']]
df['is_PL'] = ['PL' in langs for langs in df['lang']]
df

Unnamed: 0,lang,lang_count,is_PL
0,"[PL, ENG]",2,True
1,"[GER, ENG, PL, FRA]",4,True
2,[RUS],1,False


In [32]:
# Sample hostnames for the string-splitting example.
df = pd.DataFrame({'website': ['wp.pl', 'onet.pl', 'google.com']})
df

Unnamed: 0,website
0,wp.pl
1,onet.pl
2,google.com


In [36]:
# Split each hostname on '.'; expand=True spreads the pieces across columns
# (0 = text before the dot, 1 = text after it).
# NOTE(review): only pieces 0 and 1 are kept, so this assumes one dot per
# hostname -- confirm before reusing on other data.
new_col = df['website'].str.split(pat='.', expand=True)
df['portal'], df['extension'] = new_col[0], new_col[1]
df

Unnamed: 0,website,portal,extension
0,wp.pl,wp,pl
1,onet.pl,onet,pl
2,google.com,google,com
