In [2]:
import pandas as pd

In [3]:
# Load the insurance data; the path is relative, so the CSV must be in the notebook's working directory.
df = pd.read_csv('insurance.csv')

In [4]:
# Preview the first rows to confirm the file parsed into the expected columns.
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges
0,19,female,27.9,0,yes,southwest,16884.924
1,18,male,33.77,1,no,southeast,1725.5523
2,28,male,33.0,3,no,southeast,4449.462
3,33,male,22.705,0,no,northwest,21984.47061
4,32,male,28.88,0,no,northwest,3866.8552


##### (1) describe
##### describe(): a pandas DataFrame method that generates descriptive statistics for each numeric column of a dataset: count, mean, std (standard deviation), min (the smallest value in the column), 25% (25th percentile), 50% (50th percentile, i.e. the median), 75% (75th percentile), and max (the largest value in the column).

In [5]:
# Only the numeric columns are summarised here, so sex, smoker and region do not appear in the output.
df.describe()

Unnamed: 0,age,bmi,children,charges
count,1338.0,1338.0,1338.0,1338.0
mean,39.207025,30.663397,1.094918,13270.422265
std,14.04996,6.098187,1.205493,12110.011237
min,18.0,15.96,0.0,1121.8739
25%,27.0,26.29625,0.0,4740.28715
50%,39.0,30.4,1.0,9382.033
75%,51.0,34.69375,2.0,16639.912515
max,64.0,53.13,5.0,63770.42801


##### (2) assign: 
##### The pandas assign() method is used to create new columns in a dataframe, usually based on calculations. assign() takes the name of the new column to create along with the value to assign, which can come from a calculation of existing dataframe columns or from a lambda function. It returns a new dataframe rather than modifying the original, which is why the result is assigned back to df below.

In [6]:
# Copy the 'smoker' labels into a new column, leaving the original column untouched.
df = df.assign(new_smoker_col=df['smoker'])
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,new_smoker_col
0,19,female,27.900,0,yes,southwest,16884.92400,yes
1,18,male,33.770,1,no,southeast,1725.55230,no
2,28,male,33.000,3,no,southeast,4449.46200,no
3,33,male,22.705,0,no,northwest,21984.47061,no
4,32,male,28.880,0,no,northwest,3866.85520,no
...,...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,10600.54830,no
1334,18,female,31.920,0,no,northeast,2205.98080,no
1335,18,female,36.850,0,no,southeast,1629.83350,no
1336,21,female,25.800,0,no,southwest,2007.94500,no


In [7]:
# Encode smoker status numerically: 0 = non-smoker, 1 = smoker.
new_smoker_val = {'no': 0, 'yes': 1}
# Map from the untouched 'smoker' labels rather than from new_smoker_col itself:
# re-running this cell then gives the same 0/1 values instead of NaN (values
# that are already 0/1 are not keys of the mapping, and map() turns them into NaN).
smoker_num = df['smoker'].map(new_smoker_val)
# Bracket assignment sets the column whether or not it already exists;
# attribute assignment (df.new_smoker_col = ...) only works for existing columns.
df['new_smoker_col'] = smoker_num
df.head(3)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,new_smoker_col
0,19,female,27.9,0,yes,southwest,16884.924,1
1,18,male,33.77,1,no,southeast,1725.5523,0
2,28,male,33.0,3,no,southeast,4449.462,0


##### (3) df.groupby() 
##### The groupby method in pandas is used to group data by one or more criteria and perform operations on the resulting groups. For example, you can use groupby to calculate the mean, sum, or count of each group.

In [8]:
# Smokers are coded as 1, so summing the flag within each sex counts the smokers.
df.groupby('sex')['new_smoker_col'].sum()

sex
female    115
male      159
Name: new_smoker_col, dtype: int64

In [9]:
# A count can also be obtained with a boolean filter and len();
# this particular filter keeps male non-smokers (new_smoker_col == 0).
is_male = df['sex'] == 'male'
is_non_smoker = df['new_smoker_col'] == 0
len(df[is_male & is_non_smoker])

517

##### (4) loc[]:
##### df.loc is a label-based indexer in pandas (used with square brackets, not called like a function) that selects rows and columns by label or by a boolean mask. In this example a boolean mask is used to select only non-smoking females for analysis: in the output, the "sex" column contains only female and the "new_smoker_col" column contains only 0 (non-smokers).

In [10]:
# Keep only the rows for female non-smokers (new_smoker_col == 0).
female_non_smoker = (df['sex'] == 'female') & (df['new_smoker_col'] == 0)
df_l = df.loc[female_non_smoker]
df_l.head(6)

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,new_smoker_col
5,31,female,25.74,0,no,southeast,3756.6216,0
6,46,female,33.44,1,no,southeast,8240.5896,0
7,37,female,27.74,3,no,northwest,7281.5056,0
9,60,female,25.84,0,no,northwest,28923.13692,0
13,56,female,39.82,0,no,southeast,11090.7178,0
16,52,female,30.78,1,no,northeast,10797.3362,0


##### (5)sort_values() function:
##### The sort_values() function is used to sort a DataFrame based on specified columns. The most common way of sorting a dataframe is to sort by a single column using the by parameter. This will sort the dataframe in ascending order by default. That means numeric data will be sorted from smallest to largest and string data will be sorted alphabetically. If you want to sort in descending order, you can use the ascending parameter and set it to False. Passing a list of columns, as in the example below, sorts by the first column and uses the following ones to break ties.

In [11]:
# Order by age first, breaking ties by charges; ascending is the default direction.
df.sort_values(['age', 'charges'])

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,new_smoker_col
940,18,male,23.21,0,no,southeast,1121.87390,0
808,18,male,30.14,0,no,southeast,1131.50660,0
1244,18,male,33.33,0,no,southeast,1135.94070,0
663,18,male,33.66,0,no,southeast,1136.39940,0
22,18,male,34.10,0,no,southeast,1137.01100,0
...,...,...,...,...,...,...,...,...
62,64,male,24.70,1,no,northwest,30166.61817,0
420,64,male,33.88,0,yes,southeast,46889.26120,1
94,64,female,31.30,2,yes,southwest,47291.05500,1
328,64,female,33.80,1,yes,southwest,47928.03000,1


##### (6)df.apply(): 
##### The pandas apply() method takes a function that you have defined and applies it to every value of a Series (as in the example below) or along an axis of a DataFrame.

In [12]:
# Snapshot of 'charges' before the discount in the following cells is applied.
df.head()

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,new_smoker_col
0,19,female,27.9,0,yes,southwest,16884.924,1
1,18,male,33.77,1,no,southeast,1725.5523,0
2,28,male,33.0,3,no,southeast,4449.462,0
3,33,male,22.705,0,no,northwest,21984.47061,0
4,32,male,28.88,0,no,northwest,3866.8552,0


In [13]:
# let's create a function to discount charges
def discount(x, amount=1000):
    """Return the charge ``x`` reduced by a flat ``amount``.

    Parameters
    ----------
    x : number
        Original charge.
    amount : number, default 1000
        Flat discount to subtract from positive charges.

    Returns
    -------
    number
        ``x - amount`` floored at 0 when ``x`` is positive; ``x`` unchanged
        when it is zero, negative or NaN (the ``x > 0`` test is False for those).
    """
    if x > 0:
        # Floor at zero: a charge smaller than the discount must not turn negative.
        return max(x - amount, 0)
    return x

In [14]:
# Run discount() on every charge; the result is a new Series and df itself is not modified here.
df_2 = df.loc[:, 'charges'].apply(discount)
df_2

0       15884.92400
1         725.55230
2        3449.46200
3       20984.47061
4        2866.85520
           ...     
1333     9600.54830
1334     1205.98080
1335      629.83350
1336     1007.94500
1337    28141.36030
Name: charges, Length: 1338, dtype: float64

In [15]:
# Overwrite 'charges' with the discounted values held in df_2.
# Note: the original charges are replaced, so re-running the apply cell and then
# this one discounts the already-discounted values again.
df['charges'] = df_2
df

Unnamed: 0,age,sex,bmi,children,smoker,region,charges,new_smoker_col
0,19,female,27.900,0,yes,southwest,15884.92400,1
1,18,male,33.770,1,no,southeast,725.55230,0
2,28,male,33.000,3,no,southeast,3449.46200,0
3,33,male,22.705,0,no,northwest,20984.47061,0
4,32,male,28.880,0,no,northwest,2866.85520,0
...,...,...,...,...,...,...,...,...
1333,50,male,30.970,3,no,northwest,9600.54830,0
1334,18,female,31.920,0,no,northeast,1205.98080,0
1335,18,female,36.850,0,no,southeast,629.83350,0
1336,21,female,25.800,0,no,southwest,1007.94500,0


##### (7) df.merge():
##### The merge function in pandas merges two DataFrames on one or more key columns and returns a new DataFrame that represents the result of the merge operation.
##### Let's consider the datasets below.

In [38]:
# Subjects: one record per person, keyed by subject_id (stored as strings).
raw_data_1 = pd.DataFrame(
    [
        ('1', 'Alex', 'Anderson'),
        ('2', 'Amy', 'Ackerman'),
        ('3', 'Allen', 'Ali'),
        ('4', 'Alice', 'Aoni'),
        ('5', 'Ayoung', 'Atiches'),
    ],
    columns=['subject_id', 'first_name', 'last_name'],
)

# Test results: subject_ids '1'-'5' also occur in raw_data_1, '7'-'11' do not.
raw_data_2 = pd.DataFrame(
    [
        ('1', 51),
        ('2', 15),
        ('3', 15),
        ('4', 61),
        ('5', 16),
        ('7', 14),
        ('8', 15),
        ('9', 1),
        ('10', 61),
        ('11', 16),
    ],
    columns=['subject_id', 'test_id'],
)

In [42]:
# Inner join: keep only the subject_ids that occur in both frames.
inner_merge = raw_data_1.merge(raw_data_2, on="subject_id", how="inner")
inner_merge

Unnamed: 0,subject_id,first_name,last_name,test_id
0,1,Alex,Anderson,51
1,2,Amy,Ackerman,15
2,3,Allen,Ali,15
3,4,Alice,Aoni,61
4,5,Ayoung,Atiches,16


In [43]:
# Left join: keep every row of raw_data_1; test_id would be NaN for a subject with
# no match in raw_data_2 (here every subject has one, so the result equals the inner join).
left_merge = raw_data_1.merge(raw_data_2, on="subject_id", how="left")
left_merge

Unnamed: 0,subject_id,first_name,last_name,test_id
0,1,Alex,Anderson,51
1,2,Amy,Ackerman,15
2,3,Allen,Ali,15
3,4,Alice,Aoni,61
4,5,Ayoung,Atiches,16


In [44]:
# Right join: keep every row of raw_data_2; first_name/last_name are NaN for the
# subject_ids that raw_data_1 does not contain.
right_merge = raw_data_1.merge(raw_data_2, on="subject_id", how="right")
right_merge

Unnamed: 0,subject_id,first_name,last_name,test_id
0,1,Alex,Anderson,51
1,2,Amy,Ackerman,15
2,3,Allen,Ali,15
3,4,Alice,Aoni,61
4,5,Ayoung,Atiches,16
5,7,,,14
6,8,,,15
7,9,,,1
8,10,,,61
9,11,,,16


In [45]:
# Outer join: keep the subject_ids from either frame, filling the columns of the
# missing side with NaN.
outer_merge = raw_data_1.merge(raw_data_2, on="subject_id", how="outer")
outer_merge

Unnamed: 0,subject_id,first_name,last_name,test_id
0,1,Alex,Anderson,51
1,2,Amy,Ackerman,15
2,3,Allen,Ali,15
3,4,Alice,Aoni,61
4,5,Ayoung,Atiches,16
5,7,,,14
6,8,,,15
7,9,,,1
8,10,,,61
9,11,,,16
