<a href="https://colab.research.google.com/github/gorzanskik-ai/machine-learning/blob/main/supervised-learning/01_basics/03_feature_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [28]:
import sklearn
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt

In [29]:
import pandas_datareader.data as web

def get_financial_data(company):
    """Fetch the price history of a single ticker from the 'stooq' data source.

    Parameters
    ----------
    company : str
        Ticker symbol forwarded to the reader as ``name`` (e.g. ``'AMZN'``).

    Returns
    -------
    The object produced by ``web.DataReader`` for that ticker. In this
    notebook it is a date-indexed DataFrame with Open, High, Low, Close and
    Volume columns.
    """
    source = 'stooq'
    prices = web.DataReader(name=company, data_source=source)
    return prices

# Keep the untouched download in df_raw and do all further work on a copy,
# so later cells can add columns without altering the original frame.
df_raw = get_financial_data(company='AMZN')
df = df_raw.copy(deep=True)
df

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2022-04-22,2965.00,2992.300,2873.9501,2887.00,3653929
2022-04-21,3094.28,3134.760,2951.7900,2965.92,3198466
2022-04-20,3152.05,3152.050,3072.0700,3079.96,2981501
2022-04-19,3040.59,3172.980,3031.0100,3162.31,2746321
2022-04-18,3030.47,3080.790,3005.0100,3055.70,2325676
...,...,...,...,...,...
2017-05-01,927.80,954.400,927.8000,948.23,5461391
2017-04-28,948.83,949.590,924.3340,924.99,7364681
2017-04-27,914.39,921.860,912.1100,918.38,5210059
2017-04-26,910.30,915.749,907.5600,909.29,2604617


In [30]:
# Print a summary of the frame: index range, column dtypes and non-null counts.
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1259 entries, 2022-04-22 to 2017-04-25
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Open    1259 non-null   float64
 1   High    1259 non-null   float64
 2   Low     1259 non-null   float64
 3   Close   1259 non-null   float64
 4   Volume  1259 non-null   int64  
dtypes: float64(4), int64(1)
memory usage: 59.0 KB


In [31]:
# Derive calendar features (day, month, year) from the DatetimeIndex.
df['day'], df['month'], df['year'] = (
    df.index.day,
    df.index.month,
    df.index.year,
)

In [32]:
# Preview the first five rows to confirm the new day/month/year columns.
df.head(n=5)

Unnamed: 0_level_0,Open,High,Low,Close,Volume,day,month,year
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2022-04-22,2965.0,2992.3,2873.9501,2887.0,3653929,22,4,2022
2022-04-21,3094.28,3134.76,2951.79,2965.92,3198466,21,4,2022
2022-04-20,3152.05,3152.05,3072.07,3079.96,2981501,20,4,2022
2022-04-19,3040.59,3172.98,3031.01,3162.31,2746321,19,4,2022
2022-04-18,3030.47,3080.79,3005.01,3055.7,2325676,18,4,2022


Dyskretyzacja ciągłej zmiennej

In [33]:
# Small sample of continuous height values used to demonstrate binning.
df = pd.DataFrame(
    [175., 178.5, 185., 191., 184.5, 183., 168.],
    columns=['height'],
)
df

Unnamed: 0,height
0,175.0
1,178.5
2,185.0
3,191.0
4,184.5
5,183.0
6,168.0


In [34]:
# Map each height onto a labelled interval. Bin edges are right-closed by
# default, so 175.0 falls into 'small' and 185.0 into 'medium'.
# Alternatively, pass an integer as `bins` to get an automatic split.
df['height_cat'] = pd.cut(
    df['height'],
    bins=(160, 175, 185, 195),
    labels=['small', 'medium', 'high'],
)
df

Unnamed: 0,height,height_cat
0,175.0,small
1,178.5,medium
2,185.0,medium
3,191.0,high
4,184.5,medium
5,183.0,medium
6,168.0,small


In [35]:
# One-hot encode the categorical column; the numeric 'height' column passes
# through unchanged. drop_first=True omits the first level ('small'), which
# then serves as the implicit baseline (both indicator columns are 0).
pd.get_dummies(data=df, prefix='height', drop_first=True)

Unnamed: 0,height,height_medium,height_high
0,175.0,0,0
1,178.5,1,0
2,185.0,1,0
3,191.0,0,1
4,184.5,1,0
5,183.0,1,0
6,168.0,0,0


Języki

In [36]:
# Each row holds a variable-length list of language codes.
df = pd.DataFrame(
    data={
        'lang': [
            ['PL', 'ENG'],
            ['GER', 'ENG', 'PL', 'FRA'],
            ['RUS'],
        ],
    },
)
df

Unnamed: 0,lang
0,"[PL, ENG]"
1,"[GER, ENG, PL, FRA]"
2,[RUS]


In [43]:
# Count how many languages each row lists.
df['lang_number'] = [len(languages) for languages in df['lang']]

In [44]:
# Binary indicator: 1 when 'PL' appears in the row's language list, else 0.
df['PL_flag'] = [int('PL' in languages) for languages in df['lang']]
df

Unnamed: 0,lang,PL_flag,lang_number
0,"[PL, ENG]",1,2
1,"[GER, ENG, PL, FRA]",1,4
2,[RUS],0,1


Website

In [45]:
# Sample of website addresses to be split into name and extension.
df = pd.DataFrame(
    ['wp.pl', 'onet.pl', 'google.com'],
    columns=['website'],
)
df

Unnamed: 0,website
0,wp.pl
1,onet.pl
2,google.com


In [51]:
# Split every address on the dot into separate columns: column 0 holds the
# part before the dot and column 1 the part after it.
tmp = df['website'].str.split(pat='.', expand=True)
df['portal'], df['extension'] = tmp[0], tmp[1]
df

Unnamed: 0,website,portal,extension
0,wp.pl,wp,pl
1,onet.pl,onet,pl
2,google.com,google,com


In [54]:
# Show the frame without the 'portal' column. The result is not assigned,
# so df itself still contains 'portal' after this cell.
df.drop(columns=['portal'])

Unnamed: 0,website,extension
0,wp.pl,pl
1,onet.pl,pl
2,google.com,com
