## Read Data

In [4]:
import pandas as pd
from myfunc62130500048 import missing_value_summary as miss

In [7]:
# df = pd.read_csv('/content/train.csv')
data = pd.read_csv('data-missing-example.csv')

In [9]:
data.head()

Unnamed: 0,ID,Gender,Depression Rating,Favorite Color
0,1,Male,6.0,Blue
1,2,Male,2.0,Green
2,3,Female,1.0,Red
3,4,Male,4.0,
4,5,Female,5.0,Yellow


## Check Missing Data

In [10]:
data.shape

(10, 4)

In [50]:
missing_value_summary(data)

Your data contain 4 columns.
There are missing values in 2 columns.


Unnamed: 0,Total Missing Values,% of Missing Values
Depression Rating,1,10.0
Favorite Color,1,10.0


In [11]:
miss(data)

Your data contain 4 columns.
There are missing values in 2 columns.


Unnamed: 0,Total Missing Values,% of Missing Values
Depression Rating,1,10.0
Favorite Color,1,10.0


## Handle Missing Values
* Deleting
* Imputation

### Listwise: Delete entire rows
* .dropna()

In [55]:
import pandas as pd
data = pd.read_csv('/content/data-missing-example.csv')
data

Unnamed: 0,ID,Gender,Depression Rating,Favorite Color
0,1,Male,6.0,Blue
1,2,Male,2.0,Green
2,3,Female,1.0,Red
3,4,Male,4.0,
4,5,Female,5.0,Yellow
5,6,Female,9.0,Purple
6,7,Male,3.0,Green
7,8,Female,4.0,Blue
8,9,Female,,Blue
9,10,Male,8.0,Red


In [56]:
data_nomiss = data.copy()

In [58]:
data_nomiss = data_nomiss.dropna()

In [59]:
data_nomiss.shape

(8, 4)

In [60]:
data.shape

(10, 4)

### Pairwise Deletion:

In [13]:
data['Depression Rating'].isna()

0    False
1    False
2    False
3    False
4    False
5    False
6    False
7    False
8     True
9    False
Name: Depression Rating, dtype: bool

In [12]:
data[data['Depression Rating'].isna()==False]

Unnamed: 0,ID,Gender,Depression Rating,Favorite Color
0,1,Male,6.0,Blue
1,2,Male,2.0,Green
2,3,Female,1.0,Red
3,4,Male,4.0,
4,5,Female,5.0,Yellow
5,6,Female,9.0,Purple
6,7,Male,3.0,Green
7,8,Female,4.0,Blue
9,10,Male,8.0,Red


In [14]:
data[data['Depression Rating'].isna()==False]['Depression Rating']

0    6.0
1    2.0
2    1.0
3    4.0
4    5.0
5    9.0
6    3.0
7    4.0
9    8.0
Name: Depression Rating, dtype: float64

In [15]:
data[data['Depression Rating'].isna()==False]['Depression Rating'].sum()

42.0

In [16]:
data['Depression Rating'].count()

9

In [17]:
val = round(data['Depression Rating'].mean(), 2)
val

4.67

In [19]:
data['Favorite Color']

0      Blue
1     Green
2       Red
3       NaN
4    Yellow
5    Purple
6     Green
7      Blue
8      Blue
9       Red
Name: Favorite Color, dtype: object

In [18]:
data['Favorite Color'].mode()

0    Blue
Name: Favorite Color, dtype: object

### Imputation

#### Impute with mean

In [20]:
import pandas as pd
data = pd.read_csv('data-missing-example.csv')
data

Unnamed: 0,ID,Gender,Depression Rating,Favorite Color
0,1,Male,6.0,Blue
1,2,Male,2.0,Green
2,3,Female,1.0,Red
3,4,Male,4.0,
4,5,Female,5.0,Yellow
5,6,Female,9.0,Purple
6,7,Male,3.0,Green
7,8,Female,4.0,Blue
8,9,Female,,Blue
9,10,Male,8.0,Red


In [21]:
mean_rating = round(data['Depression Rating'].mean(), 1)
mean_rating

4.7

In [22]:
data.columns

Index(['ID', 'Gender', 'Depression Rating', 'Favorite Color'], dtype='object')

In [23]:
#Global mean
data['Depression Rating'] = data['Depression Rating'].fillna(mean_rating)
data

Unnamed: 0,ID,Gender,Depression Rating,Favorite Color
0,1,Male,6.0,Blue
1,2,Male,2.0,Green
2,3,Female,1.0,Red
3,4,Male,4.0,
4,5,Female,5.0,Yellow
5,6,Female,9.0,Purple
6,7,Male,3.0,Green
7,8,Female,4.0,Blue
8,9,Female,4.7,Blue
9,10,Male,8.0,Red


#### Impute With Mode

In [24]:
color_mode = data['Favorite Color'].mode()[0]
color_mode

'Blue'

In [25]:
data['Favorite Color'] = data['Favorite Color'].fillna(color_mode)
data

Unnamed: 0,ID,Gender,Depression Rating,Favorite Color
0,1,Male,6.0,Blue
1,2,Male,2.0,Green
2,3,Female,1.0,Red
3,4,Male,4.0,Blue
4,5,Female,5.0,Yellow
5,6,Female,9.0,Purple
6,7,Male,3.0,Green
7,8,Female,4.0,Blue
8,9,Female,4.7,Blue
9,10,Male,8.0,Red


#### Impute with group mean (Gender)

In [26]:
import pandas as pd
data = pd.read_csv('data-missing-example.csv')
data

Unnamed: 0,ID,Gender,Depression Rating,Favorite Color
0,1,Male,6.0,Blue
1,2,Male,2.0,Green
2,3,Female,1.0,Red
3,4,Male,4.0,
4,5,Female,5.0,Yellow
5,6,Female,9.0,Purple
6,7,Male,3.0,Green
7,8,Female,4.0,Blue
8,9,Female,,Blue
9,10,Male,8.0,Red


In [27]:
data['Gender'].unique()

array(['Male', 'Female'], dtype=object)

In [28]:
m = data[data['Gender']=='Male']['Depression Rating'].mean()
m

4.6

In [87]:
data[data['Gender']=='Male']['Depression Rating'].fillna(m)

0    6.0
1    2.0
3    4.0
6    3.0
9    8.0
Name: Depression Rating, dtype: float64

# Plotly

In [29]:
import plotly.express as px
df = pd.read_csv('train.csv')
df.head(3)

Unnamed: 0,Item_Identifier,Item_Weight,Item_Fat_Content,Item_Visibility,Item_Type,Item_MRP,Outlet_Identifier,Outlet_Establishment_Year,Outlet_Size,Outlet_Location_Type,Outlet_Type,Item_Outlet_Sales
0,FDA15,9.3,Low Fat,0.016047,Dairy,249.8092,OUT049,1999,Medium,Tier 1,Supermarket Type1,3735.138
1,DRC01,5.92,Regular,0.019278,Soft Drinks,48.2692,OUT018,2009,Medium,Tier 3,Supermarket Type2,443.4228
2,FDN15,17.5,Low Fat,0.01676,Meat,141.618,OUT049,1999,Medium,Tier 1,Supermarket Type1,2097.27


In [32]:
fig = px.histogram(df, x='Item_Type', y='Item_Outlet_Sales', color = "Outlet_Size", facet_col = "Outlet_Size")
fig.show()