<a href="https://colab.research.google.com/github/davidlealo/sic_ai_2024/blob/main/002_preprocesamiento/Chapter_3/DLO_SIC_AI_Ch03_Unit03.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

## 3.3. Checking for Missing Data

In [1]:
import pandas as pd
import numpy as np

### Missing Value

In [2]:
import seaborn as sns

# Load the 'titanic' example dataset through seaborn into the working frame.
df = sns.load_dataset('titanic')

# Tally every distinct value of 'deck'; dropna=False keeps NaN as its own entry.
deck_column = df['deck']
nan_deck = deck_column.value_counts(dropna=False)
print(nan_deck)

deck
NaN    688
C       59
B       47
D       33
E       32
A       15
F       13
G        4
Name: count, dtype: int64


In [3]:
# Element-wise missing-value mask for the first five rows (True = value is missing).
head_rows = df.head()
print(head_rows.isna())

   survived  pclass    sex    age  sibsp  parch   fare  embarked  class  \
0     False   False  False  False  False  False  False     False  False   
1     False   False  False  False  False  False  False     False  False   
2     False   False  False  False  False  False  False     False  False   
3     False   False  False  False  False  False  False     False  False   
4     False   False  False  False  False  False  False     False  False   

     who  adult_male   deck  embark_town  alive  alone  
0  False       False   True        False  False  False  
1  False       False  False        False  False  False  
2  False       False   True        False  False  False  
3  False       False  False        False  False  False  
4  False       False   True        False  False  False  


In [4]:
# Complementary mask for the first five rows (True = value is present).
head_rows = df.head()
print(head_rows.notna())

   survived  pclass   sex   age  sibsp  parch  fare  embarked  class   who  \
0      True    True  True  True   True   True  True      True   True  True   
1      True    True  True  True   True   True  True      True   True  True   
2      True    True  True  True   True   True  True      True   True  True   
3      True    True  True  True   True   True  True      True   True  True   
4      True    True  True  True   True   True  True      True   True  True   

   adult_male   deck  embark_town  alive  alone  
0        True  False         True   True   True  
1        True   True         True   True   True  
2        True  False         True   True   True  
3        True   True         True   True   True  
4        True  False         True   True   True  


In [5]:
# Count the missing cells per column within the first five rows.
head_missing = df.head().isnull()
print(head_missing.sum(axis='index'))

survived       0
pclass         0
sex            0
age            0
sibsp          0
parch          0
fare           0
embarked       0
class          0
who            0
adult_male     0
deck           3
embark_town    0
alive          0
alone          0
dtype: int64


In [6]:
# Keep only the columns that hold at least 500 non-missing values.
min_non_null = 500
df_thresh = df.dropna(axis='columns', thresh=min_non_null)
print(df_thresh.columns)

Index(['survived', 'pclass', 'sex', 'age', 'sibsp', 'parch', 'fare',
       'embarked', 'class', 'who', 'adult_male', 'embark_town', 'alive',
       'alone'],
      dtype='object')


In [7]:
# Drop every row whose 'age' is missing, then report how many rows remain.
df_age = df.dropna(axis='index', how='any', subset=['age'])
remaining_rows = len(df_age)
print(remaining_rows)

714


### Replacing Missing Values

In [8]:
# Impute the missing ages with the mean of the 'age' column.
mean_age = df['age'].mean(axis=0)

# Assign the filled Series back to the frame. The previous
# df['age'].fillna(..., inplace=True) was a chained in-place call on an
# intermediate Series: it raises a FutureWarning in pandas 2.x and no longer
# updates df once Copy-on-Write is enabled.
df['age'] = df['age'].fillna(mean_age)
print(df['age'].head(10))

0    22.000000
1    38.000000
2    26.000000
3    35.000000
4    35.000000
5    29.699118
6    54.000000
7     2.000000
8    27.000000
9    14.000000
Name: age, dtype: float64


### Practice with Missing Data

In [9]:
# Reload the dataset so the frame no longer carries the age imputation done earlier.
df = sns.load_dataset('titanic')

# Show rows 825-829 of 'embark_town'.
embark_slice = df['embark_town'][825:830]
print(embark_slice)
print('\n')

825     Queenstown
826    Southampton
827      Cherbourg
828     Queenstown
829            NaN
Name: embark_town, dtype: object




In [10]:
# Most frequent embarkation town, counted without the missing entries.
town_counts = df['embark_town'].value_counts(dropna=True)
most_freq = town_counts.idxmax()
print(most_freq)
print('\n')

Southampton




In [11]:
# Replace missing embarkation towns with the most frequent town (most_freq).
# The result is assigned back to the column: calling fillna(..., inplace=True)
# on df['embark_town'] is a chained in-place operation that warns in pandas 2.x
# and leaves df unchanged under Copy-on-Write.
df['embark_town'] = df['embark_town'].fillna(most_freq)
print(df['embark_town'][825:830])

825     Queenstown
826    Southampton
827      Cherbourg
828     Queenstown
829    Southampton
Name: embark_town, dtype: object


In [12]:
# Reload the dataset so 'embark_town' contains its original missing values again.
df = sns.load_dataset('titanic')

# Forward-fill: each missing town takes the value of the closest preceding row.
# Series.ffill() replaces fillna(method='ffill', inplace=True); the `method`
# argument of fillna is deprecated, and the chained in-place call would not
# update df under Copy-on-Write.
df['embark_town'] = df['embark_town'].ffill()
print(df['embark_town'][825:830])

825     Queenstown
826    Southampton
827      Cherbourg
828     Queenstown
829     Queenstown
Name: embark_town, dtype: object


## 3.4. Checking and Processing Duplicate Data

### Processing Duplicate Data

In [13]:
import pandas as pd

# Small toy frame for the duplicate-handling examples; rows 0 and 1 are identical.
toy_columns = {
    'c1': ['a', 'a', 'b', 'a', 'b'],
    'c2': [1, 1, 1, 2, 2],
    'c3': [1, 1, 2, 2, 2],
}
df = pd.DataFrame(toy_columns)
print(df)

  c1  c2  c3
0  a   1   1
1  a   1   1
2  b   1   2
3  a   2   2
4  b   2   2


In [14]:
# Flag rows that repeat an earlier row across all columns; the first occurrence stays False.
df_dup = df.duplicated(keep='first')
print(df_dup)

0    False
1     True
2    False
3    False
4    False
dtype: bool


In [15]:
# Same check restricted to column 'c2': True where the value was already seen above.
c2_values = df['c2']
col_dup = c2_values.duplicated(keep='first')
print(col_dup)

0    False
1     True
2     True
3    False
4     True
Name: c2, dtype: bool


In [16]:
# Remove rows that duplicate an earlier row in every column, keeping the first occurrence.
df2 = df.drop_duplicates(keep='first')
print(df2)

  c1  c2  c3
0  a   1   1
2  b   1   2
3  a   2   2
4  b   2   2


In [17]:
# Treat rows as duplicates when they match on 'c2' and 'c3' only; keep the first of each group.
key_columns = ['c2', 'c3']
df3 = df.drop_duplicates(subset=key_columns)
print(df3)

  c1  c2  c3
0  a   1   1
2  b   1   2
3  a   2   2


## 3.5. Data Feature Engineering

### Matching the Same Measurement Unit Equally

In [19]:
!wget = https://raw.githubusercontent.com/davidlealo/sic_ai_2024/main/002_preprocesamiento/Chapter_3/auto-mpg.csv

--2024-07-23 00:45:25--  http://=/
Resolving = (=)... failed: Name or service not known.
wget: unable to resolve host address ‘=’
--2024-07-23 00:45:25--  https://raw.githubusercontent.com/davidlealo/sic_ai_2024/main/002_preprocesamiento/Chapter_3/auto-mpg.csv
Resolving raw.githubusercontent.com (raw.githubusercontent.com)... 185.199.108.133, 185.199.110.133, 185.199.111.133, ...
Connecting to raw.githubusercontent.com (raw.githubusercontent.com)|185.199.108.133|:443... connected.
HTTP request sent, awaiting response... 200 OK
Length: 17647 (17K) [text/plain]
Saving to: ‘auto-mpg.csv’


2024-07-23 00:45:25 (37.1 MB/s) - ‘auto-mpg.csv’ saved [17647/17647]

FINISHED --2024-07-23 00:45:25--
Total wall clock time: 0.2s
Downloaded: 1 files, 17K in 0s (37.1 MB/s)


In [20]:
# The CSV is read without a header row (header=None); the column labels are attached afterwards.
# 'orign' is kept as spelled here because the following cells reference that label.
column_names = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
                'acceleration', 'model year', 'orign', 'name']

df = pd.read_csv('./auto-mpg.csv', header=None)
df.columns = column_names

print(df.head(3))
print('\n')

    mpg  cylinders  displacement horsepower  weight  acceleration  model year  \
0  18.0          8         307.0        130    3504          12.0          70   
1  15.0          8         350.0        165    3693          11.5          70   
2  18.0          8         318.0        150    3436          11.0          70   

   orign                       name  
0      1  chevrolet chevelle malibu  
1      1          buick skylark 320  
2      1         plymouth satellite  




In [21]:
# Factor for converting miles-per-gallon to kilometres-per-litre:
# 1.60934 km per mile divided by 3.78541 litres per (US) gallon.
mpg_to_kpl = 1.60934 / 3.78541

# Full-precision conversion first, so both the raw and the rounded result can be shown.
kpl_exact = df['mpg'] * mpg_to_kpl
df['kpl'] = kpl_exact
print(df.head(3))
print('\n')

# Overwrite the column with the value rounded to two decimals.
df['kpl'] = kpl_exact.round(2)
print(df.head(3))

    mpg  cylinders  displacement horsepower  weight  acceleration  model year  \
0  18.0          8         307.0        130    3504          12.0          70   
1  15.0          8         350.0        165    3693          11.5          70   
2  18.0          8         318.0        150    3436          11.0          70   

   orign                       name       kpl  
0      1  chevrolet chevelle malibu  7.652571  
1      1          buick skylark 320  6.377143  
2      1         plymouth satellite  7.652571  


    mpg  cylinders  displacement horsepower  weight  acceleration  model year  \
0  18.0          8         307.0        130    3504          12.0          70   
1  15.0          8         350.0        165    3693          11.5          70   
2  18.0          8         318.0        150    3436          11.0          70   

   orign                       name   kpl  
0      1  chevrolet chevelle malibu  7.65  
1      1          buick skylark 320  6.38  
2      1         plymouth satellite  7.65  

### Data Type Conversion

In [22]:
# Column dtypes: 'horsepower' is reported as object rather than a numeric type.
column_types = df.dtypes
print(column_types)
print('\n')

# Distinct raw 'horsepower' values; the list contains the placeholder '?'.
hp_values = df['horsepower'].unique()
print(hp_values)
print('\n')

mpg             float64
cylinders         int64
displacement    float64
horsepower       object
weight            int64
acceleration    float64
model year        int64
orign             int64
name             object
kpl             float64
dtype: object


['130' '165' '150' '140' '198' '220' '215' '225' '190' '170' '160' '95'
 '97' '85' '88' '46' '87' '90' '113' '200' '210' '193' '?' '100' '105'
 '175' '153' '180' '110' '72' '86' '70' '76' '65' '69' '60' '80' '54'
 '208' '155' '112' '92' '145' '137' '158' '167' '94' '107' '230' '49' '75'
 '91' '122' '67' '83' '78' '52' '61' '93' '148' '129' '96' '71' '98' '115'
 '53' '81' '79' '120' '152' '102' '108' '68' '58' '149' '89' '63' '48'
 '66' '139' '103' '125' '133' '138' '135' '142' '77' '62' '132' '84' '64'
 '74' '116' '82']




In [23]:
# Distinct codes stored in the 'orign' column.
origin_codes = df['orign'].unique()
print(origin_codes)

[1 3 2]


In [24]:
# Map the numeric codes in 'orign' to region labels.
# The mapped Series is assigned back to the frame: the previous
# df['orign'].replace(..., inplace=True) was a chained in-place call on a
# column selection, which warns in pandas 2.x and does not modify df under
# Copy-on-Write.
origin_labels = {1: 'USA', 2: 'EU', 3: 'JAPAN'}
df['orign'] = df['orign'].replace(origin_labels)

print(df['orign'].unique())
print(df['orign'].dtypes)
print('\n')

['USA' 'JAPAN' 'EU']
object




In [25]:
# Round trip on 'orign': cast to category, then back to str, printing the dtype after each cast.
orign_as_category = df['orign'].astype('category')
df['orign'] = orign_as_category
print(df['orign'].dtypes)

df['orign'] = orign_as_category.astype('str')
print(df['orign'].dtypes)

category
object


In [26]:
# Show three rows of 'model year' before and after casting the column to category.
# A fixed random_state makes the printed sample reproducible across runs and
# selects the same three rows both times, so only the dtype differs.
print(df['model year'].sample(3, random_state=0))
df['model year'] = df['model year'].astype('category')
print(df['model year'].sample(3, random_state=0))

227    77
315    80
32     71
Name: model year, dtype: int64
178    75
241    77
215    76
Name: model year, dtype: category
Categories (13, int64): [70, 71, 72, 73, ..., 79, 80, 81, 82]


### Categorical Data Conversion (Division of Sections)

In [27]:
# Reload the raw CSV (no header row) and attach the column labels.
df = pd.read_csv('./auto-mpg.csv', header=None)

df.columns = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
              'acceleration', 'model year', 'orign', 'name']

# Turn the '?' placeholder into NaN. The result is assigned back because a
# chained df['horsepower'].replace(..., inplace=True) acts on an intermediate
# Series: it warns in pandas 2.x and leaves df untouched under Copy-on-Write,
# in which case the float cast below would fail on '?'.
df['horsepower'] = df['horsepower'].replace('?', np.nan)
# Drop the rows without a horsepower value, then convert the column to float.
df.dropna(subset=['horsepower'], axis=0, inplace=True)
df['horsepower'] = df['horsepower'].astype('float')

In [28]:
# Histogram of 'horsepower' with three bins; keep the per-bin counts and the four bin edges.
histogram_result = np.histogram(df['horsepower'], bins=3)
count, bin_dividers = histogram_result
print(bin_dividers)

[ 46.         107.33333333 168.66666667 230.        ]


In [29]:
# One label per bin, ordered from the lowest bin to the highest.
bin_names = ['Low output', 'Normal output', 'High output']

# Assign each horsepower value to its bin; include_lowest=True makes the first
# interval closed on the left so the minimum value is binned too.
hp_binned = pd.cut(df['horsepower'], bin_dividers,
                   labels=bin_names, include_lowest=True)
df['hp_bin'] = hp_binned

preview_columns = ['horsepower', 'hp_bin']
print(df[preview_columns].head(15))

    horsepower         hp_bin
0        130.0  Normal output
1        165.0  Normal output
2        150.0  Normal output
3        150.0  Normal output
4        140.0  Normal output
5        198.0    High output
6        220.0    High output
7        215.0    High output
8        225.0    High output
9        190.0    High output
10       170.0    High output
11       160.0  Normal output
12       150.0  Normal output
13       225.0    High output
14        95.0     Low output


### Categorical Data Conversion (Divisions of Sections)

In [30]:
# Reload the raw CSV (no header row) and attach the column labels.
df = pd.read_csv('./auto-mpg.csv', header=None)

df.columns = ['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
              'acceleration', 'model year', 'orign', 'name']

# Turn the '?' placeholder into NaN. The result is assigned back because a
# chained df['horsepower'].replace(..., inplace=True) acts on an intermediate
# Series: it warns in pandas 2.x and leaves df untouched under Copy-on-Write,
# in which case the float cast below would fail on '?'.
df['horsepower'] = df['horsepower'].replace('?', np.nan)
# Drop the rows without a horsepower value, then convert the column to float.
df.dropna(subset=['horsepower'], axis=0, inplace=True)
df['horsepower'] = df['horsepower'].astype('float')

# Three-bin histogram of horsepower; bin_dividers holds the bin edges and
# bin_names the label for each bin, lowest first.
count, bin_dividers = np.histogram(df['horsepower'], bins=3)
bin_names = ['Low output', 'Normal output', 'High output']

### One-Hot Vector

In [31]:
# Label each horsepower value with its bin (first interval closed on the left).
hp_bin_labels = pd.cut(df['horsepower'], bin_dividers,
                       labels=bin_names, include_lowest=True)
df['hp_bin'] = hp_bin_labels

# One indicator column per bin label, cast to integers.
indicator_frame = pd.get_dummies(df['hp_bin'])
horsepower_dummies = indicator_frame.astype('int')
print(horsepower_dummies.head(15))

    Low output  Normal output  High output
0            0              1            0
1            0              1            0
2            0              1            0
3            0              1            0
4            0              1            0
5            0              0            1
6            0              0            1
7            0              0            1
8            0              0            1
9            0              0            1
10           0              0            1
11           0              1            0
12           0              1            0
13           0              0            1
14           1              0            0


# Get Dummies

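En la [documentación de `pandas.get_dummies`](https://pandas.pydata.org/docs/reference/api/pandas.get_dummies.html) encontramos el parámetro `drop_first`, que recomiendo usar: con `drop_first=True` se elimina la primera categoría y se obtienen k-1 columnas indicadoras a partir de k categorías.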

In [38]:
# drop_first=True omits the indicator column of the first category, leaving k-1 columns.
hp_bin_column = df['hp_bin']
horsepower_dummies_1 = pd.get_dummies(hp_bin_column, drop_first=True)
print(horsepower_dummies_1)

     Normal output  High output
0             True        False
1             True        False
2             True        False
3             True        False
4             True        False
..             ...          ...
393          False        False
394          False        False
395          False        False
396          False        False
397          False        False

[392 rows x 2 columns]


In [39]:
# Display the frame's column labels as the cell result; get_dummies returned a
# separate frame, so df still has a single 'hp_bin' column.
df.columns

Index(['mpg', 'cylinders', 'displacement', 'horsepower', 'weight',
       'acceleration', 'model year', 'orign', 'name', 'hp_bin'],
      dtype='object')

### Normalization

In [32]:
# Summary statistics before scaling.
print(df['horsepower'].describe())
print('\n')

# Divide by the absolute value of the column maximum, so the maximum itself maps to 1.0.
hp_peak = abs(df['horsepower'].max())
df['horsepower'] = df['horsepower'] / hp_peak

# First rows and summary statistics after scaling.
print(df['horsepower'].head())
print('\n')
print(df['horsepower'].describe())

count    392.000000
mean     104.469388
std       38.491160
min       46.000000
25%       75.000000
50%       93.500000
75%      126.000000
max      230.000000
Name: horsepower, dtype: float64


0    0.565217
1    0.717391
2    0.652174
3    0.652174
4    0.608696
Name: horsepower, dtype: float64


count    392.000000
mean       0.454215
std        0.167353
min        0.200000
25%        0.326087
50%        0.406522
75%        0.547826
max        1.000000
Name: horsepower, dtype: float64
