### ADVANCED DATA ANALYTICS

#### TOP 8 POWERFUL FUNCTIONS IN PYTHON FOR DEEPER DATA ANALYTICS

1. GROUPBY
2. PIVOT_TABLE
3. AGGREGATE
4. CROSS_TAB
5. CONCAT
6. APPLY
7. LAMBDA
8. DEL(KEYWORD)

### GROUPBY

In [35]:
import pandas as pd

In [36]:
insurance_data = pd.read_csv(filepath_or_buffer="insurance.csv")

In [37]:
insurance_data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [4]:
insurance_data.shape

(1338, 7)

In [5]:
insurance_data.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [7]:
insurance_data.isnull().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [8]:
insurance_data.dtypes

age           int64
sex          object
bmi         float64
children      int64
smoker       object
region       object
charges     float64
dtype: object

In [9]:
insurance_data.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


### Pull up the average insurance charges based on region

In [11]:
insurance_data['region'].unique()

array(['southwest', 'southeast', 'northwest', 'northeast'], dtype=object)

In [12]:
insurance_data['region'].nunique()

4

In [13]:
insurance_data.describe(include='all')

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
count,1338.0,1338,1338.0,1338.0,1338,1338,1338.0
unique,,2,,,2,4,
top,,male,,,no,southeast,
freq,,676,,,1064,364,
mean,39.207025,,30.663397,1.094918,,,13270.422265
std,14.04996,,6.098187,1.205493,,,12110.011237
min,18.0,,15.96,0.0,,,1121.8739
25%,27.0,,26.29625,0.0,,,4740.28715
50%,39.0,,30.4,1.0,,,9382.033
75%,51.0,,34.69375,2.0,,,16639.912515


In [18]:
insurance_data.groupby("region")["charges"].mean().round(2).sort_values() #ascending=True

region
southwest    12346.94
northwest    12417.58
northeast    13406.38
southeast    14735.41
Name: charges, dtype: float64

In [20]:
insurance_data.groupby("region")["charges"].mean().sort_values(ascending=False).round(2) #chain of methods

region
southeast    14735.41
northeast    13406.38
northwest    12417.58
southwest    12346.94
Name: charges, dtype: float64

#### discrete data and continuous data

1. discrete - countable/can't be segmented
2. continuous - measurable/have units/can also be segmented

In [21]:
insurance_data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [22]:
# continuous columns: age, bmi, charges
# discrete columns: sex, children, smoker, region

### pull up the average insurance charges based on region and sex

In [26]:
insurance_data.groupby(by=["region","sex"])["charges"].mean().sort_values().round(2)

region     sex   
southwest  female    11274.41
northwest  male      12354.12
           female    12479.87
northeast  female    12953.20
southwest  male      13412.88
southeast  female    13499.67
northeast  male      13854.01
southeast  male      15879.62
Name: charges, dtype: float64

In [27]:
insurance_data.groupby(by=["region","sex"])["charges"].max().sort_values(ascending=False).round(2)

region     sex   
southeast  female    63770.43
           male      62592.87
northwest  male      60021.40
northeast  female    58571.07
northwest  female    55135.40
southwest  male      52590.83
           female    48824.45
northeast  male      48549.18
Name: charges, dtype: float64

### PIVOT TABLE

In [30]:
# pivot_table gives a more presentable comparison of 2 categorical features against 1 continuous feature

In [29]:
pd.pivot_table(data=insurance_data,values="charges",index="region",columns="sex").round()

sex,female,male
region,Unnamed: 1_level_1,Unnamed: 2_level_1
northeast,12953.0,13854.0
northwest,12480.0,12354.0
southeast,13500.0,15880.0
southwest,11274.0,13413.0


In [33]:
pd.pivot_table(data=insurance_data,values="charges",index=["region"],columns=["sex","children"]).round()

sex,female,female,female,female,female,female,male,male,male,male,male,male
children,0,1,2,3,4,5,0,1,2,3,4,5
region,Unnamed: 1_level_2,Unnamed: 2_level_2,Unnamed: 3_level_2,Unnamed: 4_level_2,Unnamed: 5_level_2,Unnamed: 6_level_2,Unnamed: 7_level_2,Unnamed: 8_level_2,Unnamed: 9_level_2,Unnamed: 10_level_2,Unnamed: 11_level_2,Unnamed: 12_level_2
northeast,9790.0,17125.0,14523.0,13730.0,14542.0,,13390.0,15474.0,12318.0,14935.0,14442.0,6979.0
northwest,12584.0,11119.0,13128.0,13431.0,11024.0,8966.0,9813.0,9515.0,13763.0,22971.0,11508.0,
southeast,14862.0,9870.0,11766.0,19723.0,18267.0,9924.0,13792.0,17425.0,18648.0,17102.0,8727.0,10307.0
southwest,10150.0,10842.0,16230.0,8439.0,10946.0,10024.0,13727.0,9971.0,18876.0,12072.0,17924.0,6865.0


### AGGREGATE FUNCTION

In [34]:
insurance_data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


### find out the average insurance charges based on region

In [35]:
insurance_data.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [36]:
insurance_data.groupby("region")["charges"].aggregate(func=["mean","min","max","sum"]).round()

Unnamed: 0_level_0,mean,min,max,sum
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
northeast,13406.0,1695.0,58571.0,4343669.0
northwest,12418.0,1621.0,60021.0,4035712.0
southeast,14735.0,1122.0,63770.0,5363690.0
southwest,12347.0,1242.0,52591.0,4012755.0


In [37]:
insurance_data.groupby(["region","sex"])["charges"].aggregate(func=["mean","min","max","sum"]).round()

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,min,max,sum
region,sex,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
northeast,female,12953.0,2196.0,58571.0,2085466.0
northeast,male,13854.0,1695.0,48549.0,2258203.0
northwest,female,12480.0,2117.0,55135.0,2046699.0
northwest,male,12354.0,1621.0,60021.0,1989013.0
southeast,female,13500.0,1608.0,63770.0,2362442.0
southeast,male,15880.0,1122.0,62593.0,3001248.0
southwest,female,11274.0,1728.0,48824.0,1826455.0
southwest,male,13413.0,1242.0,52591.0,2186300.0


### CROSS TAB

In [43]:
# crosstab compares two categorical features and reports the frequency of each combination

In [38]:
insurance_data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


#### how many smokers and non-smokers based on region

In [42]:
pd.crosstab(index=insurance_data["region"],columns=insurance_data["smoker"],margins=True)

smoker,no,yes,All
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
northeast,257,67,324
northwest,267,58,325
southeast,273,91,364
southwest,267,58,325
All,1064,274,1338


### CONCAT

In [44]:
sales_data_2017 = pd.read_csv(filepath_or_buffer="2017.csv")

In [45]:
sales_data_2017.head()

Unnamed: 0,Date,Voucher,Party,Product,Qty,Rate,Gross,Disc,Voucher Amount
0,1/4/2017,Sal:1,SOLANKI PLASTICS,DONA-VAI-9100,2,1690.0,3380.0,,13100.0
1,1/4/2017,Sal:1,SOLANKI PLASTICS,LITE FOAM(1200),6,1620.0,9720.0,,
2,1/4/2017,Sal:2,SARNESWARA TRADERS,VISHNU CHOTA WINE,500,23.0,11500.0,,30990.0
3,1/4/2017,Sal:2,SARNESWARA TRADERS,LITE FOAM(1200),6,1620.0,9720.0,,
4,1/4/2017,Sal:2,SARNESWARA TRADERS,DONA-VAI-9100,5,1690.0,8450.0,,


In [46]:
sales_data_2018 = pd.read_csv(filepath_or_buffer="Sales-Transcations-2018.csv")

In [47]:
sales_data_2019 = pd.read_csv(filepath_or_buffer="Sales-Transcations-2019.csv")

In [48]:
sales_data_2018.head()

Unnamed: 0,Date,Voucher,Party,Product,Qty,Rate,Gross,Disc,Voucher Amount
0,1/4/2018,Sal:146,TP13,SILVER POUCH 9*12,50.0,85.0,4250.0,,66724.0
1,1/4/2018,Sal:146,TP13,RUBBER,5.0,290.0,1450.0,,
2,1/4/2018,Sal:146,TP13,DURGA 10*12 Blue,1600.0,5.5,8800.0,,
3,1/4/2018,Sal:146,TP13,DURGA 13*16 BLUE,400.0,11.0,4400.0,,
4,1/4/2018,Sal:146,TP13,10*12 SARAS-NAT,600.0,8.1,4860.0,,


In [50]:
sales_data_2019.head()

Unnamed: 0,Date,Voucher,Party,Product,Qty,Rate,Gross,Disc,Voucher Amount
0,1/4/2019,Sal:687,BALAJI PLASTICS,DONA-VAI-9100,1,1730.0,1730.0,,3460.0
1,1/4/2019,Sal:687,BALAJI PLASTICS,SMART BOUL(48),1,1730.0,1730.0,,
2,1/4/2019,Sal:688,BALAJI PLASTICS,Vishnu Ice,110,18.5,2035.0,,2035.0
3,,,28/3,,0,0.0,,,
4,1/4/2019,Sal:689,BALAJI PLASTICS,100LEAF -SP,3,585.0,1755.0,,1755.0


In [52]:
sales_edited_data = pd.read_csv("Sales-Transcations-Edited.csv")

In [55]:
# Stack the three yearly sales extracts row-wise into one frame.
# Each input frame is numbered from 0, so ignore_index=True renumbers the
# combined rows and avoids duplicate index labels in the result.
sales_full_data = pd.concat(
    objs=[sales_data_2017, sales_data_2018, sales_data_2019],
    ignore_index=True,
)

In [56]:
sales_full_data.dtypes

Date              object
Voucher           object
Party             object
Product           object
Qty               object
Rate              object
Gross             object
Disc              object
Voucher Amount    object
dtype: object

In [57]:
sales_full_data.isna().sum()

Date               12591
Voucher            12557
Party                 40
Product            12591
Qty                12557
Rate               12558
Gross              12558
Disc              105609
Voucher Amount     83646
dtype: int64

### MERGE FUNCTION

In [59]:
#LEFT RIGHT INNER OUTER JOINS - TASK

### APPLY

In [60]:
insurance_data.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


#### Create a new "eligibility criteria" column in the same DataFrame: label a row "Eligible" if age is greater than 20, otherwise "Not Eligible"

In [61]:
# Demonstration: comparing a Series yields one boolean per row, so the result
# cannot drive a plain `if`. Catch the resulting ValueError and print its
# message so the notebook still runs top to bottom; the element-wise
# approach (a helper applied with .apply) follows in the next cells.
try:
    if insurance_data.age > 20:
        print("Eligible")
    else:
        print("Not Eligible")
except ValueError as err:
    print(f"ValueError: {err}")

ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().

In [62]:
def get_age(x):
    """Return the eligibility label for a single age value.

    Values strictly greater than 20 give "Eligible"; anything else
    gives "Not Eligible".
    """
    return "Eligible" if x > 20 else "Not Eligible"

In [63]:
# Run get_age on every value of the "age" column and store the returned labels in a new column.
insurance_data["Eligible Criteria"]= insurance_data["age"].apply(get_age)

In [64]:
insurance_data


Unnamed: 0,age,sex,bmi,children,smoker,region,charges,Eligible Criteria
0,19,female,27.900,0,yes,southwest,16884.92400,Not Eligible
1,18,male,33.770,1,no,southeast,1725.55230,Not Eligible
2,28,male,33.000,3,no,southeast,4449.46200,Eligible
3,33,male,22.705,0,no,northwest,21984.47061,Eligible
4,32,male,28.880,0,no,northwest,3866.85520,Eligible
...,...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830,Eligible
1334,18,female,31.920,0,no,northeast,2205.98080,Not Eligible
1335,18,female,36.850,0,no,southeast,1629.83350,Not Eligible
1336,21,female,25.800,0,no,southwest,2007.94500,Eligible


### LAMBDA FUNCTION

In [66]:
# SINGLE EXPRESSION

In [68]:
# ALSO CALLED ANONYMOUS FUNCTION OR NAMELESS FUNCTION

In [69]:
insurance_data.age

0       19
1       18
2       28
3       33
4       32
        ..
1333    50
1334    18
1335    18
1336    21
1337    61
Name: age, Length: 1338, dtype: int64

In [70]:
lambda x: "Eligible" if x>20 else "Not Eligible"

<function __main__.<lambda>(x)>

In [71]:
# Same labelling rule as the named helper used earlier, written inline as a lambda.
# NOTE(review): the column name is spelled "Eligibilty"; the later `del` cell uses
# the same spelling, so the two must be renamed together if this is corrected.
insurance_data["Eligibilty_Criteria_2"] = insurance_data["age"].apply(lambda x: "Eligible" if x>20 else "Not Eligible")

In [72]:
insurance_data

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,Eligibilty_Criteria_2
0,19,female,27.900,0,yes,southwest,16884.92400,Not Eligible
1,18,male,33.770,1,no,southeast,1725.55230,Not Eligible
2,28,male,33.000,3,no,southeast,4449.46200,Eligible
3,33,male,22.705,0,no,northwest,21984.47061,Eligible
4,32,male,28.880,0,no,northwest,3866.85520,Eligible
...,...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830,Eligible
1334,18,female,31.920,0,no,northeast,2205.98080,Not Eligible
1335,18,female,36.850,0,no,southeast,1629.83350,Not Eligible
1336,21,female,25.800,0,no,southwest,2007.94500,Eligible


In [73]:
# lambda function - map, query , filter

In [74]:
# explore - lambda and list comprehension

### DELETE KEYWORD

In [75]:
del insurance_data["Eligibilty_Criteria_2"]

In [76]:
insurance_data

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [77]:
insurance_data.region

0       southwest
1       southeast
2       southeast
3       northwest
4       northwest
          ...    
1333    northwest
1334    northeast
1335    southeast
1336    southwest
1337    northwest
Name: region, Length: 1338, dtype: object

### the end!!!

### SELF EVALUATION

In [1]:
import pandas as pd

#### GROUP BY

The groupby() function in pandas is used to split data into groups based on some criteria, apply a function to each group, and then combine the results

In [2]:
insurance_data = pd.read_csv(filepath_or_buffer="insurance.csv")

In [3]:
insurance_data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [4]:
insurance_data.shape

(1338, 7)

In [6]:
insurance_data.dtypes

age           int64
sex          object
bmi         float64
children      int64
smoker       object
region       object
charges     float64
dtype: object

In [9]:
insurance_data.isna().sum()

age         0
sex         0
bmi         0
children    0
smoker      0
region      0
charges     0
dtype: int64

In [10]:
insurance_data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1338 entries, 0 to 1337
Data columns (total 7 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   age       1338 non-null   int64  
 1   sex       1338 non-null   object 
 2   bmi       1338 non-null   float64
 3   children  1338 non-null   int64  
 4   smoker    1338 non-null   object 
 5   region    1338 non-null   object 
 6   charges   1338 non-null   float64
dtypes: float64(2), int64(2), object(3)
memory usage: 73.3+ KB


In [11]:
insurance_data.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [12]:
insurance_data

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


- continuous data = age, bmi, charges
- discrete data = sex, children, smoker, region

In [17]:
insurance_data.groupby(by="region")["charges"].mean().sort_values().round(2)

region
southwest    12346.94
northwest    12417.58
northeast    13406.38
southeast    14735.41
Name: charges, dtype: float64

In [19]:
insurance_data.groupby(by="region")["charges"].min().round()

region
northeast    1695.0
northwest    1621.0
southeast    1122.0
southwest    1242.0
Name: charges, dtype: float64

In [21]:
insurance_data.groupby(by=["region","sex"])["charges"].max().round()

region     sex   
northeast  female    58571.0
           male      48549.0
northwest  female    55135.0
           male      60021.0
southeast  female    63770.0
           male      62593.0
southwest  female    48824.0
           male      52591.0
Name: charges, dtype: float64

In [26]:
insurance_data.groupby(by=["region","sex","smoker"])[["charges","age","bmi"]].mean()

Unnamed: 0_level_0,Unnamed: 1_level_0,Unnamed: 2_level_0,charges,age,bmi
region,sex,smoker,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
northeast,female,no,9640.426984,39.840909,29.777462
northeast,female,yes,28032.046398,38.724138,27.261724
northeast,male,no,8664.042222,39.216,28.86176
northeast,male,yes,30926.252583,37.868421,29.56
northwest,female,no,8786.998679,39.755556,29.488704
northwest,female,yes,29670.824946,38.827586,28.296897
northwest,male,no,8320.689321,38.568182,28.930379
northwest,male,yes,30713.181419,39.827586,29.983966
southeast,female,no,8440.205552,39.071942,32.78
southeast,female,yes,33034.820716,39.25,32.251389


#### PIVOT TABLE

It's an advanced version of groupby(), used to present a comparison of 2 categorical features against 1 continuous feature in a more readable table.

In [31]:
pd.pivot_table(data=insurance_data,values="charges",index="sex",columns="children")

children,0,1,2,3,4,5
sex,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
female,11905.714276,12161.360414,13941.317326,13865.605066,13937.674562,9854.006419
male,12832.696736,13273.522458,16187.095325,16789.167419,13782.284829,7931.65831


In [32]:
insurance_data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


#### aggregate function

The aggregate() function in pandas is used to apply one or more aggregation operations across a DataFrame or Series — essentially a way to compute summary statistics flexibly


In [39]:
insurance_data.groupby(by=["region","sex"])["charges"].aggregate(func=["mean","sum"]).round()

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,sum
region,sex,Unnamed: 2_level_1,Unnamed: 3_level_1
northeast,female,12953.0,2085466.0
northeast,male,13854.0,2258203.0
northwest,female,12480.0,2046699.0
northwest,male,12354.0,1989013.0
southeast,female,13500.0,2362442.0
southeast,male,15880.0,3001248.0
southwest,female,11274.0,1826455.0
southwest,male,13413.0,2186300.0


In [40]:
insurance_data.groupby(by=["children","smoker"])["bmi"].aggregate(func=["mean","sum","max","min"]).round()

Unnamed: 0_level_0,Unnamed: 1_level_0,mean,sum,max,min
children,smoker,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
0,no,31.0,14023.0,53.0,16.0
0,yes,31.0,3513.0,47.0,19.0
1,no,31.0,8039.0,50.0,17.0
1,yes,31.0,1883.0,53.0,20.0
2,no,31.0,5713.0,48.0,17.0
2,yes,31.0,1722.0,48.0,17.0
3,no,31.0,3627.0,47.0,19.0
3,yes,31.0,1190.0,42.0,20.0
4,no,32.0,697.0,41.0,24.0
4,yes,29.0,88.0,34.0,26.0


### CROSS TAB

It compares two categorical features and reports the frequency of each combination.

In [41]:
insurance_data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [42]:
pd.crosstab(index=insurance_data["region"],columns=insurance_data["sex"],margins=True)

sex,female,male,All
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
northeast,161,163,324
northwest,164,161,325
southeast,175,189,364
southwest,162,163,325
All,662,676,1338


In [43]:
pd.crosstab(index=insurance_data["region"],columns=insurance_data["smoker"],margins=True)

smoker,no,yes,All
region,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
northeast,257,67,324
northwest,267,58,325
southeast,273,91,364
southwest,267,58,325
All,1064,274,1338


### CONCAT

concat() function is used to combine (concatenate) multiple DataFrames or Series along a particular axis (rows or columns)

In [44]:
# Load the three yearly sales extracts, one DataFrame per file, in year order.
sales_2017, sales_2018, sales_2019 = (
    pd.read_csv(csv_name)
    for csv_name in (
        "2017.csv",
        "Sales-Transcations-2018.csv",
        "Sales-Transcations-2019.csv",
    )
)

In [47]:
sales_2017

Unnamed: 0,Date,Voucher,Party,Product,Qty,Rate,Gross,Disc,Voucher Amount
0,1/4/2017,Sal:1,SOLANKI PLASTICS,DONA-VAI-9100,2,1690.00,3380.00,,13100.00
1,1/4/2017,Sal:1,SOLANKI PLASTICS,LITE FOAM(1200),6,1620.00,9720.00,,
2,1/4/2017,Sal:2,SARNESWARA TRADERS,VISHNU CHOTA WINE,500,23,11500.00,,30990.00
3,1/4/2017,Sal:2,SARNESWARA TRADERS,LITE FOAM(1200),6,1620.00,9720.00,,
4,1/4/2017,Sal:2,SARNESWARA TRADERS,DONA-VAI-9100,5,1690.00,8450.00,,
...,...,...,...,...,...,...,...,...,...
47285,31/03/2018,Sal:10042,Vkp,10*10 SHEET,25,137,3425.00,,3425.00
47286,,,,,,,,,
47287,,,,,,,,,
47288,,Total,,,607734.60,669300.49,9953816.13,106607.00,9868583.13


In [48]:
sales_2018

Unnamed: 0,Date,Voucher,Party,Product,Qty,Rate,Gross,Disc,Voucher Amount
0,1/4/2018,Sal:146,TP13,SILVER POUCH 9*12,50,85,4250.00,,66724.00
1,1/4/2018,Sal:146,TP13,RUBBER,5,290,1450.00,,
2,1/4/2018,Sal:146,TP13,DURGA 10*12 Blue,1600.00,5.5,8800.00,,
3,1/4/2018,Sal:146,TP13,DURGA 13*16 BLUE,400,11,4400.00,,
4,1/4/2018,Sal:146,TP13,10*12 SARAS-NAT,600,8.1,4860.00,,
...,...,...,...,...,...,...,...,...,...
44735,31/03/2019,Sal:9610,HAMPI FOODS,SPOON SOOFY,200,40,8000.00,,
44736,,,,,,,,,
44737,,,,,,,,,
44738,,Total,,,666056.00,1067808.80,10796991.30,29999.00,10787647.30


In [49]:
sales_2019

Unnamed: 0,Date,Voucher,Party,Product,Qty,Rate,Gross,Disc,Voucher Amount
0,1/4/2019,Sal:687,BALAJI PLASTICS,DONA-VAI-9100,1,1730.00,1730.00,,3460.00
1,1/4/2019,Sal:687,BALAJI PLASTICS,SMART BOUL(48),1,1730.00,1730.00,,
2,1/4/2019,Sal:688,BALAJI PLASTICS,Vishnu Ice,110,18.5,2035.00,,2035.00
3,,,28/3,,0,0,,,
4,1/4/2019,Sal:689,BALAJI PLASTICS,100LEAF -SP,3,585,1755.00,,1755.00
...,...,...,...,...,...,...,...,...,...
19171,10/10/2019,Sal:4935,K.SRIHARI,13*16 WHITE RK,400,16,6400.00,,
19172,,,,,,,,,
19173,,,,,,,,,
19174,,Total,,,99284.90,175381.65,2203649.50,20680.00,2189014.50


In [45]:
# Stack the three yearly sales frames row-wise into one frame.
# Each input frame is numbered from 0, so ignore_index=True renumbers the
# combined rows and avoids duplicate index labels in the result.
sales_combined = pd.concat(
    objs=[sales_2017, sales_2018, sales_2019],
    ignore_index=True,
)

In [46]:
sales_combined.head()

Unnamed: 0,Date,Voucher,Party,Product,Qty,Rate,Gross,Disc,Voucher Amount
0,1/4/2017,Sal:1,SOLANKI PLASTICS,DONA-VAI-9100,2,1690.0,3380.0,,13100.0
1,1/4/2017,Sal:1,SOLANKI PLASTICS,LITE FOAM(1200),6,1620.0,9720.0,,
2,1/4/2017,Sal:2,SARNESWARA TRADERS,VISHNU CHOTA WINE,500,23.0,11500.0,,30990.0
3,1/4/2017,Sal:2,SARNESWARA TRADERS,LITE FOAM(1200),6,1620.0,9720.0,,
4,1/4/2017,Sal:2,SARNESWARA TRADERS,DONA-VAI-9100,5,1690.0,8450.0,,


In [50]:
sales_combined.shape

(111206, 9)

#### MERGE FUNCTION

the merge() function is used to combine two DataFrames based on common columns or indexes, similar to SQL joins.

In [51]:
# Left-hand table for the merge demo: one row per person (id, name, role).
one = pd.DataFrame(
    [
        (1, "Chandru", "AI"),
        (2, "Kadhirvel", "Cloud"),
        (3, "Razeeth", "Business"),
        (4, "Lokesh", "Billionaire"),
        (5, "Naveen", "IT Specialist"),
        (6, "Boominath", "Junior Engineer"),
        (7, "karthik", "Electronics"),
        (8, "Madhan", "Electrical Specialist"),
        (9, "Kavin", "Income-TAX"),
        (10, "Arunraj", "Project Coordinator"),
    ],
    columns=["id", "name", "role"],
)

# Right-hand table for the merge demo: the address for each of the same ids.
two = pd.DataFrame(
    [
        (1, "India"),
        (2, "India"),
        (3, "Russia"),
        (4, "USA"),
        (5, "UK"),
        (6, "India"),
        (7, "India"),
        (8, "UAE"),
        (9, "India"),
        (10, "UAE"),
    ],
    columns=["id", "address"],
)

In [52]:
one.shape

(10, 3)

In [53]:
two.shape

(10, 2)

In [56]:
pd.merge(one,two,on='id',how="inner")

Unnamed: 0,id,name,role,address
0,1,Chandru,AI,India
1,2,Kadhirvel,Cloud,India
2,3,Razeeth,Business,Russia
3,4,Lokesh,Billionaire,USA
4,5,Naveen,IT Specialist,UK
5,6,Boominath,Junior Engineer,India
6,7,karthik,Electronics,India
7,8,Madhan,Electrical Specialist,UAE
8,9,Kavin,Income-TAX,India
9,10,Arunraj,Project Coordinator,UAE


In [57]:
three = pd.merge(one,two,on='id',how="inner")

#### APPLY

apply() function is used to apply a custom function (built-in, user-defined, or lambda) to each element, row, or column of a DataFrame or Series


In [58]:
insurance_data.shape

(1338, 7)

In [61]:
insurance_data.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


In [67]:
def health(x):
    """Return the health label for a single BMI value.

    Values strictly below 24 give "Good Health"; anything else gives
    "Bad Health".
    """
    # NOTE(review): every value under 24 is labelled good, however low — confirm
    # that a single upper threshold is the intended rule.
    return "Good Health" if x < 24 else "Bad Health"

In [68]:
# Run health on every value of the "bmi" column and store the returned labels in a new column.
insurance_data["Health Condition"] = insurance_data["bmi"].apply(health)

In [69]:
insurance_data.head(21)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,Health Condition
0,19,female,27.9,0,yes,southwest,16884.924,Bad Health
1,18,male,33.77,1,no,southeast,1725.5523,Bad Health
2,28,male,33.0,3,no,southeast,4449.462,Bad Health
3,33,male,22.705,0,no,northwest,21984.47061,Good Health
4,32,male,28.88,0,no,northwest,3866.8552,Bad Health
5,31,female,25.74,0,no,southeast,3756.6216,Bad Health
6,46,female,33.44,1,no,southeast,8240.5896,Bad Health
7,37,female,27.74,3,no,northwest,7281.5056,Bad Health
8,37,male,29.83,2,no,northeast,6406.4107,Bad Health
9,60,female,25.84,0,no,northwest,28923.13692,Bad Health


In [70]:
insurance_data.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


In [72]:
insurance_data

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,Health Condition
0,19,female,27.900,0,yes,southwest,16884.92400,Bad Health
1,18,male,33.770,1,no,southeast,1725.55230,Bad Health
2,28,male,33.000,3,no,southeast,4449.46200,Bad Health
3,33,male,22.705,0,no,northwest,21984.47061,Good Health
4,32,male,28.880,0,no,northwest,3866.85520,Bad Health
...,...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830,Bad Health
1334,18,female,31.920,0,no,northeast,2205.98080,Bad Health
1335,18,female,36.850,0,no,southeast,1629.83350,Bad Health
1336,21,female,25.800,0,no,southwest,2007.94500,Bad Health


#### DELETE

In [73]:
del insurance_data["Health Condition"]

In [74]:
insurance_data

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


#### LAMBDA FUNCTION

In [77]:
lambda x: "Even" if x%2==0 else "Odd"

<function __main__.<lambda>(x)>

In [78]:
insurance_data

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.900,0,yes,southwest,16884.92400
1,18,male,33.770,1,no,southeast,1725.55230
2,28,male,33.000,3,no,southeast,4449.46200
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.880,0,no,northwest,3866.85520
...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830
1334,18,female,31.920,0,no,northeast,2205.98080
1335,18,female,36.850,0,no,southeast,1629.83350
1336,21,female,25.800,0,no,southwest,2007.94500


In [89]:
# Small hand-made frame for the .loc / .iloc filtering examples that follow:
# one row per person with city, age, revenue and profit.
data = pd.DataFrame(
    [
        ("Arun", "Chennai", 25, 1000, 200),
        ("Divya", "Mumbai", 30, 1500, 300),
        ("Kiran", "Delhi", 28, 1200, 250),
        ("Meena", "Chennai", 22, 900, 150),
        ("Rahul", "Bangalore", 35, 2000, 500),
    ],
    columns=["Name", "City", "Age", "Revenue", "Profit"],
)

data



Unnamed: 0,Name,City,Age,Revenue,Profit
0,Arun,Chennai,25,1000,200
1,Divya,Mumbai,30,1500,300
2,Kiran,Delhi,28,1200,250
3,Meena,Chennai,22,900,150
4,Rahul,Bangalore,35,2000,500


In [90]:
data.loc[data['City']=="Chennai"]

Unnamed: 0,Name,City,Age,Revenue,Profit
0,Arun,Chennai,25,1000,200
3,Meena,Chennai,22,900,150


In [93]:
data.loc[data["City"]=="Bangalore"]

Unnamed: 0,Name,City,Age,Revenue,Profit
4,Rahul,Bangalore,35,2000,500


In [94]:
data.loc[data["City"]=="Mumbai"]

Unnamed: 0,Name,City,Age,Revenue,Profit
1,Divya,Mumbai,30,1500,300


In [104]:
data.iloc[:]

Unnamed: 0,Name,City,Age,Revenue,Profit
0,Arun,Chennai,25,1000,200
1,Divya,Mumbai,30,1500,300
