<a href="https://colab.research.google.com/github/davidofitaly/machine-learning-notes/blob/main/supervised/01_basics/03_feature_extraction.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

### Import libraries

In [1]:
import numpy as np
import pandas as pd
import sklearn

# Display the installed scikit-learn version so the run environment is recorded.
sklearn.__version__

'1.3.2'

### Loading the data

In [2]:
def fetch_financial_data(company='AMZN'):
    """
    Download stock market quotations for a single ticker.

    Parameters
    ----------
    company : str, default 'AMZN'
        Ticker symbol, forwarded to the reader as ``name``.

    Returns
    -------
    Whatever ``DataReader`` returns for the 'stooq' data source. In this
    notebook that is a date-indexed frame with Open, High, Low, Close and
    Volume columns.
    """
    # Local import: pandas_datareader is only required when this function runs.
    from pandas_datareader import data as web

    quotes = web.DataReader(name=company, data_source='stooq')
    return quotes

# Fetch quotations for the default ticker and preview the first five rows.
df_raw = fetch_financial_data()
df_raw.head(n=5)

Unnamed: 0_level_0,Open,High,Low,Close,Volume
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
2024-09-13,187.0,188.5,185.91,186.49,26495351
2024-09-12,184.8,187.41,183.54,187.0,33622483
2024-09-11,180.095,184.99,175.73,184.52,42564698
2024-09-10,177.49,180.5,176.79,179.55,36233796
2024-09-09,174.53,175.85,173.51,175.4,29037362


### Create a copy of the data

In [3]:
# Work on an independent frame so df_raw stays untouched. Slice first, then
# copy: the result owns its data, so the columns added to it later are not
# written into a row slice of a temporary object.
# NOTE(review): this drops the first five rows (the most recent quotes in the
# preview above); the notebook does not say why — confirm the intent.
df = df_raw.iloc[5:].copy()
df.info()

<class 'pandas.core.frame.DataFrame'>
DatetimeIndex: 1253 entries, 2024-09-06 to 2019-09-16
Data columns (total 5 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   Open    1253 non-null   float64
 1   High    1253 non-null   float64
 2   Low     1253 non-null   float64
 3   Close   1253 non-null   float64
 4   Volume  1253 non-null   int64  
dtypes: float64(4), int64(1)
memory usage: 58.7 KB


### Generating new variables

In [4]:
# Preview the month component exposed by the Date index before storing it as a column.
df.index.month

Index([9, 9, 9, 9, 8, 8, 8, 8, 8, 8,
       ...
       9, 9, 9, 9, 9, 9, 9, 9, 9, 9],
      dtype='int32', name='Date', length=1253)

In [5]:
# Derive calendar features from the Date index: one new column per component,
# added in the order day, month, year.
df['day'], df['month'], df['year'] = df.index.day, df.index.month, df.index.year
df

Unnamed: 0_level_0,Open,High,Low,Close,Volume,day,month,year
Date,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
2024-09-06,177.2400,178.3800,171.160,171.3900,41466537,6,9,2024
2024-09-05,175.0000,179.8750,174.995,177.8900,40170526,5,9,2024
2024-09-04,174.4800,175.9800,172.540,173.3300,30309225,4,9,2024
2024-09-03,177.5500,178.2600,175.260,176.2500,37817511,3,9,2024
2024-08-30,172.7800,178.9000,172.600,178.5000,43429355,30,8,2024
...,...,...,...,...,...,...,...,...
2019-09-20,91.0855,91.5315,89.046,89.7080,111116780,20,9,2019
2019-09-19,91.0510,91.6285,90.895,91.0750,41566700,19,9,2019
2019-09-18,90.8520,91.1030,89.775,90.8730,50720240,18,9,2019
2019-09-17,90.3540,91.1995,90.205,91.1275,40661160,17,9,2019


### Discretization of a continuous variable

In [6]:
# Small toy frame with a single continuous column used by the binning examples.
df = pd.DataFrame(
    {
        'height': [175., 178.5, 185., 191., 184.5, 183., 168.],
    }
)
df

Unnamed: 0,height
0,175.0
1,178.5
2,185.0
3,191.0
4,184.5
5,183.0
6,168.0


In [8]:
# An integer `bins` asks pd.cut for that many equal-width intervals spanning
# the observed range of the column.
df['height_cut'] = pd.cut(df['height'], bins=3)
df

Unnamed: 0,height,height_cut
0,175.0,"(167.977, 175.667]"
1,178.5,"(175.667, 183.333]"
2,185.0,"(183.333, 191.0]"
3,191.0,"(183.333, 191.0]"
4,184.5,"(183.333, 191.0]"
5,183.0,"(175.667, 183.333]"
6,168.0,"(167.977, 175.667]"


In [9]:
# Explicit bin edges: intervals are closed on the right, so 175.0 lands in
# (160, 175]. This overwrites the previous height_cut column.
df['height_cut'] = pd.cut(df['height'], bins=[160, 175, 180, 195])
df

Unnamed: 0,height,height_cut
0,175.0,"(160, 175]"
1,178.5,"(175, 180]"
2,185.0,"(180, 195]"
3,191.0,"(180, 195]"
4,184.5,"(180, 195]"
5,183.0,"(180, 195]"
6,168.0,"(160, 175]"


In [10]:
# Same edges as before, but each interval is replaced by a readable label
# (one label per bin, in bin order).
df['height_cut'] = pd.cut(
    df['height'],
    bins=[160, 175, 180, 195],
    labels=['small', 'medium', 'high'],
)
df

Unnamed: 0,height,height_cut
0,175.0,small
1,178.5,medium
2,185.0,high
3,191.0,high
4,184.5,high
5,183.0,high
6,168.0,small


In [13]:
# One-hot encode the categorical height_cut column as 0/1 integers.
# drop_first=True omits the first category ('small'), which is then implied
# by zeros in the remaining indicator columns.
pd.get_dummies(
    data=df,
    prefix='height',
    dtype=int,
    drop_first=True,
)

Unnamed: 0,height,height_medium,height_high
0,175.0,0,0
1,178.5,1,0
2,185.0,0,1
3,191.0,0,1
4,184.5,0,1
5,183.0,0,1
6,168.0,0,0


### Feature extraction

In [14]:
# Toy frame in which every cell holds a list of language codes.
df = pd.DataFrame(
    {
        'lang': [
            ['PL', 'ENG'],
            ['GER', 'ENG', 'PL', 'FRA'],
            ['RUS'],
        ],
    }
)
df

Unnamed: 0,lang
0,"[PL, ENG]"
1,"[GER, ENG, PL, FRA]"
2,[RUS]


In [15]:
# Count how many languages each row lists by taking the length of its list.
df['lang_number'] = df['lang'].map(len)
df

Unnamed: 0,lang,lang_number
0,"[PL, ENG]",2
1,"[GER, ENG, PL, FRA]",4
2,[RUS],1


In [16]:
# Binary indicator: 1 when 'PL' is among the row's languages, otherwise 0.
df['PL_flag'] = df['lang'].apply(lambda langs: int('PL' in langs))
df

Unnamed: 0,lang,lang_number,PL_flag
0,"[PL, ENG]",2,1
1,"[GER, ENG, PL, FRA]",4,1
2,[RUS],1,0


In [17]:
# Toy frame of website addresses used by the string-splitting examples.
df = pd.DataFrame(
    {
        'website': ['wp.pl', 'onet.pl', 'google.com'],
    }
)
df

Unnamed: 0,website
0,wp.pl
1,onet.pl
2,google.com


In [19]:
# Split each address on '.'; expand=True spreads the pieces into separate columns.
df['website'].str.split(pat='.', expand=True)

Unnamed: 0,0,1
0,wp,pl
1,onet,pl
2,google,com


In [20]:
# Split each address on '.' and store the pieces as features: column 0 is the
# site name and column 1 the domain suffix for the two-part addresses used here.
new = df['website'].str.split(pat='.', expand=True)
df['portal'], df['extension'] = new[0], new[1]
df

Unnamed: 0,website,portal,extension
0,wp.pl,wp,pl
1,onet.pl,onet,pl
2,google.com,google,com
