## **Import Libraries**

In [2]:
import numpy as np
import pandas as pd
import scipy.stats as stats

## **Data Loading**

In [3]:
# Load the supermarket sales data from the CSV file into a DataFrame.
df = pd.read_csv("supermarket_sales.csv")
# Show the first rows as a quick look at the loaded columns.
df.head()

Unnamed: 0,Invoice ID,Branch,City,Customer type,Gender,Product line,Unit price,Quantity,Tax 5%,Total,Date,Time,Payment,cogs,gross margin percentage,gross income,Rating
0,750-67-8428,A,Yangon,Member,Female,Health and beauty,74.69,7,26.1415,548.9715,1/5/2019,13:08,Ewallet,522.83,4.761905,26.1415,9.1
1,226-31-3081,C,Naypyitaw,Normal,Female,Electronic accessories,15.28,5,3.82,80.22,3/8/2019,10:29,Cash,76.4,4.761905,3.82,9.6
2,631-41-3108,A,Yangon,Normal,Male,Home and lifestyle,46.33,7,16.2155,340.5255,3/3/2019,13:23,Credit card,324.31,4.761905,16.2155,7.4
3,123-19-1176,A,Yangon,Member,Male,Health and beauty,58.22,8,23.288,489.048,1/27/2019,20:33,Ewallet,465.76,4.761905,23.288,8.4
4,373-73-7910,A,Yangon,Normal,Male,Sports and travel,86.31,7,30.2085,634.3785,2/8/2019,10:37,Ewallet,604.17,4.761905,30.2085,5.3


## **Data Exploration**

In [4]:
# Count the missing values in each column.
df.isna().sum()

Invoice ID                 0
Branch                     0
City                       0
Customer type              0
Gender                     0
Product line               0
Unit price                 0
Quantity                   0
Tax 5%                     0
Total                      0
Date                       0
Time                       0
Payment                    0
cogs                       0
gross margin percentage    0
gross income               0
Rating                     0
dtype: int64

Karena data yang diberikan tidak memiliki nilai kosong, ***Data Cleaning*** tidak perlu dilakukan.

In [5]:
# Print the column names, non-null counts and dtypes of the DataFrame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1000 entries, 0 to 999
Data columns (total 17 columns):
 #   Column                   Non-Null Count  Dtype  
---  ------                   --------------  -----  
 0   Invoice ID               1000 non-null   object 
 1   Branch                   1000 non-null   object 
 2   City                     1000 non-null   object 
 3   Customer type            1000 non-null   object 
 4   Gender                   1000 non-null   object 
 5   Product line             1000 non-null   object 
 6   Unit price               1000 non-null   float64
 7   Quantity                 1000 non-null   int64  
 8   Tax 5%                   1000 non-null   float64
 9   Total                    1000 non-null   float64
 10  Date                     1000 non-null   object 
 11  Time                     1000 non-null   object 
 12  Payment                  1000 non-null   object 
 13  cogs                     1000 non-null   float64
 14  gross margin percentage  

In [6]:
# 1. Contingency table: number of transactions per product line (rows) and city (columns).
# pd.crosstab counts rows directly, so the result does not depend on nulls in an
# unrelated column, and combinations that never occur are reported as 0.
df_pivot = pd.crosstab(index=df['Product line'], columns=df['City'])
df_pivot

City,Mandalay,Naypyitaw,Yangon
Product line,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Electronic accessories,55,55,60
Fashion accessories,62,65,51
Food and beverages,50,66,58
Health and beauty,53,52,47
Home and lifestyle,50,45,65
Sports and travel,62,45,59


In [10]:
# Chi-square test of independence on the product line x city contingency table.
# The degrees of freedom are stored in `dof` rather than `df`, so the sales
# DataFrame `df` is not overwritten and stays usable by the cells below.
chisq, pvalue, dof, expected = stats.chi2_contingency(df_pivot)
print(f'Observed chi2: {chisq:.2f}')
print(f'p-value: {pvalue:.4f}')

Observed chi2: 11.56
p-value: 0.3156


In [7]:
# 2. Contingency table: number of transactions per product line (rows) and customer type (columns).
# pd.crosstab counts rows directly, so the result does not depend on nulls in an
# unrelated column, and combinations that never occur are reported as 0.
df_pivot1 = pd.crosstab(index=df['Product line'], columns=df['Customer type'])
df_pivot1

Customer type,Member,Normal
Product line,Unnamed: 1_level_1,Unnamed: 2_level_1
Electronic accessories,78,92
Fashion accessories,86,92
Food and beverages,94,80
Health and beauty,73,79
Home and lifestyle,83,77
Sports and travel,87,79


In [11]:
# Chi-square test of independence on the product line x customer type contingency table.
# The degrees of freedom are stored in `dof` rather than `df`, so the sales
# DataFrame `df` is not overwritten and stays usable by the cells below.
chisq, pvalue, dof, expected = stats.chi2_contingency(df_pivot1)
print(f'Observed chi2: {chisq:.2f}')
print(f'p-value: {pvalue:.4f}')

Observed chi2: 3.33
p-value: 0.6500


In [8]:
# 3. Contingency table: number of transactions per product line (rows) and gender (columns).
# pd.crosstab counts rows directly, so the result does not depend on nulls in an
# unrelated column, and combinations that never occur are reported as 0.
df_pivot2 = pd.crosstab(index=df['Product line'], columns=df['Gender'])
df_pivot2

Gender,Female,Male
Product line,Unnamed: 1_level_1,Unnamed: 2_level_1
Electronic accessories,84,86
Fashion accessories,96,82
Food and beverages,90,84
Health and beauty,64,88
Home and lifestyle,79,81
Sports and travel,88,78


In [12]:
# Chi-square test of independence on the product line x gender contingency table.
# The degrees of freedom are stored in `dof` rather than `df`, so the sales
# DataFrame `df` is not overwritten and stays usable by the cells below.
chisq, pvalue, dof, expected = stats.chi2_contingency(df_pivot2)
print(f'Observed chi2: {chisq:.2f}')
print(f'p-value: {pvalue:.4f}')

Observed chi2: 5.74
p-value: 0.3319


In [9]:
# 4. Contingency table: number of transactions per product line (rows) and payment method (columns).
# pd.crosstab counts rows directly, so the result does not depend on nulls in an
# unrelated column, and combinations that never occur are reported as 0.
df_pivot3 = pd.crosstab(index=df['Product line'], columns=df['Payment'])
df_pivot3

Payment,Cash,Credit card,Ewallet
Product line,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
Electronic accessories,71,46,53
Fashion accessories,57,56,65
Food and beverages,57,61,56
Health and beauty,49,50,53
Home and lifestyle,51,45,64
Sports and travel,59,53,54


In [13]:
# Chi-square test of independence on the product line x payment method contingency table.
# The degrees of freedom are stored in `dof` rather than `df`, so the sales
# DataFrame `df` is not overwritten by this cell.
chisq, pvalue, dof, expected = stats.chi2_contingency(df_pivot3)
print(f'Observed chi2: {chisq:.2f}')
print(f'p-value: {pvalue:.4f}')

Observed chi2: 8.72
p-value: 0.5587
