In [20]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn import datasets
import seaborn as sns
from sklearn.linear_model import LinearRegression
from scipy import stats
from sklearn.preprocessing import StandardScaler

In [2]:
data = pd.read_csv('iris-data.csv')
data

Unnamed: 0,sepal_length_cm,sepal_width_cm,petal_length_cm,petal_width_cm,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,2.3,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [3]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 150 entries, 0 to 149
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   sepal_length_cm  150 non-null    float64
 1   sepal_width_cm   150 non-null    float64
 2   petal_length_cm  150 non-null    float64
 3   petal_width_cm   145 non-null    float64
 4   class            150 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.0+ KB


In [4]:
nan_cols = data.isna().sum()
nan_cols

sepal_length_cm    0
sepal_width_cm     0
petal_length_cm    0
petal_width_cm     5
class              0
dtype: int64

In [5]:
nan_cols[nan_cols>0]

petal_width_cm    5
dtype: int64

In [6]:
# Dropeamos Valores Nulos

drop_data = data.dropna()

In [7]:
# Comprobamos que se hayan eliminado los Valores Nulos

nan_cols = drop_data.isna().sum()

nan_cols[nan_cols>0]

Series([], dtype: int64)

In [8]:
drop_data.info()

<class 'pandas.core.frame.DataFrame'>
Index: 145 entries, 0 to 149
Data columns (total 5 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   sepal_length_cm  145 non-null    float64
 1   sepal_width_cm   145 non-null    float64
 2   petal_length_cm  145 non-null    float64
 3   petal_width_cm   145 non-null    float64
 4   class            145 non-null    object 
dtypes: float64(4), object(1)
memory usage: 6.8+ KB


In [9]:
data = drop_data

In [10]:
data

Unnamed: 0,sepal_length_cm,sepal_width_cm,petal_length_cm,petal_width_cm,class
0,5.1,3.5,1.4,0.2,Iris-setosa
1,4.9,3.0,1.4,0.2,Iris-setosa
2,4.7,3.2,1.3,0.2,Iris-setosa
3,4.6,3.1,1.5,0.2,Iris-setosa
4,5.0,3.6,1.4,0.2,Iris-setosa
...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica
146,6.3,2.5,5.0,2.3,Iris-virginica
147,6.5,3.0,5.2,2.0,Iris-virginica
148,6.2,3.4,5.4,2.3,Iris-virginica


In [14]:
def missing_percentage(data):
    percent_missing = data.isnull().sum() * 100 / len(data)
    missing_values_df = pd.DataFrame({'column_name': data.columns,'percent_missing': percent_missing})
    return missing_values_df.reset_index(drop=True)

In [15]:
missing_percentage(data)

Unnamed: 0,column_name,percent_missing
0,sepal_length_cm,0.0
1,sepal_width_cm,0.0
2,petal_length_cm,0.0
3,petal_width_cm,0.0
4,class,0.0


In [16]:
# Hacer Encoding: Coger solo las categóricas y no las numéricas

cat_cols = ['class']
data_encoded = data[cat_cols]
data_encoded

Unnamed: 0,class
0,Iris-setosa
1,Iris-setosa
2,Iris-setosa
3,Iris-setosa
4,Iris-setosa
...,...
145,Iris-virginica
146,Iris-virginica
147,Iris-virginica
148,Iris-virginica


In [18]:
cat_cols = ['class']
titanic_one_hot_encoding = pd.get_dummies(data_encoded[cat_cols], 
                                          columns=['class'], 
                                          drop_first=True)
titanic_one_hot_encoding

Unnamed: 0,class_Iris-setossa,class_Iris-versicolor,class_Iris-virginica,class_versicolor
0,False,False,False,False
1,False,False,False,False
2,False,False,False,False
3,False,False,False,False
4,False,False,False,False
...,...,...,...,...
145,False,False,True,False
146,False,False,True,False
147,False,False,True,False
148,False,False,True,False


In [22]:
df = titanic_one_hot_encoding.astype(int)
df

Unnamed: 0,class_Iris-setossa,class_Iris-versicolor,class_Iris-virginica,class_versicolor
0,0,0,0,0
1,0,0,0,0
2,0,0,0,0
3,0,0,0,0
4,0,0,0,0
...,...,...,...,...
145,0,0,1,0
146,0,0,1,0
147,0,0,1,0
148,0,0,1,0


In [23]:
df_combined = pd.concat([data, df], axis=1)
df_combined

Unnamed: 0,sepal_length_cm,sepal_width_cm,petal_length_cm,petal_width_cm,class,class_Iris-setossa,class_Iris-versicolor,class_Iris-virginica,class_versicolor
0,5.1,3.5,1.4,0.2,Iris-setosa,0,0,0,0
1,4.9,3.0,1.4,0.2,Iris-setosa,0,0,0,0
2,4.7,3.2,1.3,0.2,Iris-setosa,0,0,0,0
3,4.6,3.1,1.5,0.2,Iris-setosa,0,0,0,0
4,5.0,3.6,1.4,0.2,Iris-setosa,0,0,0,0
...,...,...,...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,Iris-virginica,0,0,1,0
146,6.3,2.5,5.0,2.3,Iris-virginica,0,0,1,0
147,6.5,3.0,5.2,2.0,Iris-virginica,0,0,1,0
148,6.2,3.4,5.4,2.3,Iris-virginica,0,0,1,0


In [24]:
df_combined = df_combined.drop(columns = ['class'])
df_combined

Unnamed: 0,sepal_length_cm,sepal_width_cm,petal_length_cm,petal_width_cm,class_Iris-setossa,class_Iris-versicolor,class_Iris-virginica,class_versicolor
0,5.1,3.5,1.4,0.2,0,0,0,0
1,4.9,3.0,1.4,0.2,0,0,0,0
2,4.7,3.2,1.3,0.2,0,0,0,0
3,4.6,3.1,1.5,0.2,0,0,0,0
4,5.0,3.6,1.4,0.2,0,0,0,0
...,...,...,...,...,...,...,...,...
145,6.7,3.0,5.2,2.3,0,0,1,0
146,6.3,2.5,5.0,2.3,0,0,1,0
147,6.5,3.0,5.2,2.0,0,0,1,0
148,6.2,3.4,5.4,2.3,0,0,1,0


In [25]:
df_combined.to_csv('data_cleaning.csv', index=False)