#OPPE REVISION SESSION
July 18, 2023

In [None]:
from sklearn import model_selection, preprocessing

In [None]:
dir(model_selection)

##### Import Basic Libraries and Data Understanding


In [None]:
import pandas as pd
import numpy as np

In [None]:
df = pd.read_csv('/content/dataset.csv')

In [None]:
df.head()

Unnamed: 0,id,gender,area,qualification,income,marital_status,vintage,claim_amount,num_policies,policy,type_of_policy,cltv
0,27529,Male,Urban,High School,42.99,0,0,3849.0,1,A,Platinum,66816
1,27116,Male,Rural,Bachelor,5.33,1,6,3006.0,More than 1,A,Gold,67164
2,6499,Female,Urban,High School,2.26,1,2,,More than 1,A,Platinum,68076
3,61863,Male,Rural,High School,20.29,1,8,2844.0,More than 1,A,Platinum,63276
4,25045,Female,Urban,High School,5.63,0,6,6370.0,More than 1,A,Platinum,245844


In [None]:
label = df['cltv']
features = df.drop('cltv', axis = 1)

In [None]:
df.shape

(6257, 12)

##### Splitting into train and test sets


In [None]:
from sklearn.model_selection import train_test_split

In [None]:
X_train, X_test, y_train, y_test = train_test_split(features, label, random_state = 28, test_size = 0.2)

In [None]:
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((5005, 11), (1252, 11), (5005,), (1252,))

##### Handling Missing Values

In [None]:
df.isnull().sum()

id                  0
gender              0
area              384
qualification       0
income            401
marital_status      0
vintage             0
claim_amount      360
num_policies        0
policy            372
type_of_policy      0
cltv                0
dtype: int64

In [None]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6257 entries, 0 to 6256
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              6257 non-null   int64  
 1   gender          6257 non-null   object 
 2   area            5873 non-null   object 
 3   qualification   6257 non-null   object 
 4   income          5856 non-null   float64
 5   marital_status  6257 non-null   int64  
 6   vintage         6257 non-null   int64  
 7   claim_amount    5897 non-null   float64
 8   num_policies    6257 non-null   object 
 9   policy          5885 non-null   object 
 10  type_of_policy  6257 non-null   object 
 11  cltv            6257 non-null   int64  
dtypes: float64(2), int64(4), object(6)
memory usage: 586.7+ KB


In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer

In [None]:
trans = [
    ('mf', SimpleImputer(strategy = 'most_frequent'), ['area', 'policy']),
    ('mean', SimpleImputer(strategy = 'mean'), ['income']),
    ('median', SimpleImputer(strategy = 'median'), [7])
]

ct = ColumnTransformer( transformers = trans, remainder = 'passthrough', verbose_feature_names_out = False)

In [None]:
ct.transformers

[('mf', SimpleImputer(strategy='most_frequent'), ['area', 'policy']),
 ('mean', SimpleImputer(), ['income']),
 ('median', SimpleImputer(strategy='median'), [7])]

In [None]:
ct.transformers[0]

('mf', SimpleImputer(strategy='most_frequent'), ['area', 'policy'])

In [None]:
#ct.transformers[0][1].statistics

In [None]:
X_train_clean = pd.DataFrame(ct.fit_transform(X_train), columns = ct.get_feature_names_out())
X_train_clean

In [None]:
X_test_clean = pd.DataFrame(ct.transform(X_test), columns = ct.get_feature_names_out())
X_test_clean

In [None]:
X_train_clean.isnull().sum()

area              0
policy            0
income            0
claim_amount      0
id                0
gender            0
qualification     0
marital_status    0
vintage           0
num_policies      0
type_of_policy    0
dtype: int64

##### OneHotEncoder , StandardScaler

In [None]:
from sklearn.preprocessing import OneHotEncoder, StandardScaler

In [None]:
X_train_clean.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 5005 entries, 0 to 5004
Data columns (total 11 columns):
 #   Column          Non-Null Count  Dtype 
---  ------          --------------  ----- 
 0   area            5005 non-null   object
 1   policy          5005 non-null   object
 2   income          5005 non-null   object
 3   claim_amount    5005 non-null   object
 4   id              5005 non-null   object
 5   gender          5005 non-null   object
 6   qualification   5005 non-null   object
 7   marital_status  5005 non-null   object
 8   vintage         5005 non-null   object
 9   num_policies    5005 non-null   object
 10  type_of_policy  5005 non-null   object
dtypes: object(11)
memory usage: 430.2+ KB


In [None]:
df.select_dtypes(exclude = 'number').columns

Index(['gender', 'area', 'qualification', 'num_policies', 'policy',
       'type_of_policy'],
      dtype='object')

In [None]:
df.select_dtypes('number').columns

Index(['id', 'income', 'marital_status', 'vintage', 'claim_amount', 'cltv'], dtype='object')

In [None]:
trans2 = [
    ('ohe', OneHotEncoder(), ['gender', 'area', 'qualification', 'num_policies', 'policy', 'type_of_policy']),
    ('ss', StandardScaler(), ['income', 'marital_status', 'vintage', 'claim_amount'])
]

ct2 = ColumnTransformer(transformers = trans2)

In [None]:
ct2.fit_transform(X_train_clean)

In [None]:
ct2.transform(X_test_clean)

##### Model Fitting: Linear Regression

#  **Week-1 Live Session-1**

* alt + enter ( run cell and create new cell and take cursor to next cell )

* ctrl + enter ( run cell )

* shift + enter ( run cell and take cursor to next cell)

To get the info about particular module

* ctrl + shift + space

* ctrl + space

##### Importing Basic Libraries

In [None]:
# Importing Basic Libraries

import pandas as pd
import numpy as np

##### Loading Dataset (reading csv file)

In [None]:
# loading dataset

data = pd.read_csv('/content/dataset.csv')

In [None]:
# checking the number of rows and columns of data

data.shape

(6257, 12)

In [None]:
# to check initial samples of the data (by default = 5 samples)

data.head()

Unnamed: 0,id,gender,area,qualification,income,marital_status,vintage,claim_amount,num_policies,policy,type_of_policy,cltv
0,27529,Male,Urban,High School,42.99,0,0,3849.0,1,A,Platinum,66816
1,27116,Male,Rural,Bachelor,5.33,1,6,3006.0,More than 1,A,Gold,67164
2,6499,Female,Urban,High School,2.26,1,2,,More than 1,A,Platinum,68076
3,61863,Male,Rural,High School,20.29,1,8,2844.0,More than 1,A,Platinum,63276
4,25045,Female,Urban,High School,5.63,0,6,6370.0,More than 1,A,Platinum,245844


In [None]:
# information about the DataFrame

data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 6257 entries, 0 to 6256
Data columns (total 12 columns):
 #   Column          Non-Null Count  Dtype  
---  ------          --------------  -----  
 0   id              6257 non-null   int64  
 1   gender          6257 non-null   object 
 2   area            5873 non-null   object 
 3   qualification   6257 non-null   object 
 4   income          5856 non-null   float64
 5   marital_status  6257 non-null   int64  
 6   vintage         6257 non-null   int64  
 7   claim_amount    5897 non-null   float64
 8   num_policies    6257 non-null   object 
 9   policy          5885 non-null   object 
 10  type_of_policy  6257 non-null   object 
 11  cltv            6257 non-null   int64  
dtypes: float64(2), int64(4), object(6)
memory usage: 586.7+ KB


In [None]:
data['gender'].value_counts()

Male      3569
Female    2688
Name: gender, dtype: int64

In [None]:
# to count the number of null values

data.isnull().sum()

id                  0
gender              0
area              384
qualification       0
income            401
marital_status      0
vintage             0
claim_amount      360
num_policies        0
policy            372
type_of_policy      0
cltv                0
dtype: int64

In [None]:
# accessing particular row and columns

data.iloc[:3, 2] # index of rows , column index

0    Urban
1    Rural
2    Urban
Name: area, dtype: object

In [None]:
data.loc[:4, "gender"] # row index , column name

0      Male
1      Male
2    Female
3      Male
4    Female
Name: gender, dtype: object

In [None]:
# to drop/remove columns or rows from data

features = data.drop(['cltv'], axis = 1)

In [None]:
label = data['cltv']

In [None]:
# putting conditions

data[data['income'] > 60]

In [None]:
# multiple conditions --> and & , or |

data[data['income'] > 60][data['gender'] == 'Male'] ### Same as 'and'

#  **Week-1 Live Session-2**

In [None]:
import pandas as pd
import numpy as np

In [None]:
from sklearn.datasets import load_iris

In [None]:
iris = load_iris(as_frame = True)

##### UNDERSTANDING DATA

In [None]:
iris.frame.shape

(150, 5)

In [None]:
iris.keys()

dict_keys(['data', 'target', 'frame', 'target_names', 'DESCR', 'feature_names', 'filename', 'data_module'])

In [None]:
dir(iris)

['DESCR',
 'data',
 'data_module',
 'feature_names',
 'filename',
 'frame',
 'target',
 'target_names']

In [None]:
X = iris.data # features
y = iris.target # label

In [None]:
iris.target_names

array(['setosa', 'versicolor', 'virginica'], dtype='<U10')

In [None]:
df = iris.frame

In [None]:
df.index

RangeIndex(start=0, stop=150, step=1)

In [None]:
df.describe()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
count,150.0,150.0,150.0,150.0,150.0
mean,5.843333,3.057333,3.758,1.199333,1.0
std,0.828066,0.435866,1.765298,0.762238,0.819232
min,4.3,2.0,1.0,0.1,0.0
25%,5.1,2.8,1.6,0.3,0.0
50%,5.8,3.0,4.35,1.3,1.0
75%,6.4,3.3,5.1,1.8,2.0
max,7.9,4.4,6.9,2.5,2.0


In [None]:
df.head()

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
0,5.1,3.5,1.4,0.2,0
1,4.9,3.0,1.4,0.2,0
2,4.7,3.2,1.3,0.2,0
3,4.6,3.1,1.5,0.2,0
4,5.0,3.6,1.4,0.2,0


In [None]:
df.columns

Index(['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)',
       'petal width (cm)', 'target'],
      dtype='object')

##### SORTING THE DATA

In [None]:
# Sort Column Names in Descending Order

df.sort_index(axis = 1, ascending = False)

Unnamed: 0,target,sepal width (cm),sepal length (cm),petal width (cm),petal length (cm)
0,0,3.5,5.1,0.2,1.4
1,0,3.0,4.9,0.2,1.4
2,0,3.2,4.7,0.2,1.3
3,0,3.1,4.6,0.2,1.5
4,0,3.6,5.0,0.2,1.4
...,...,...,...,...,...
145,2,3.0,6.7,2.3,5.2
146,2,2.5,6.3,1.9,5.0
147,2,3.0,6.5,2.0,5.2
148,2,3.4,6.2,2.3,5.4


In [None]:
# Sorting by values of particular column in Ascending Order

df.sort_values(by = 'sepal width (cm)')

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
60,5.0,2.0,3.5,1.0,1
62,6.0,2.2,4.0,1.0,1
119,6.0,2.2,5.0,1.5,2
68,6.2,2.2,4.5,1.5,1
41,4.5,2.3,1.3,0.3,0
...,...,...,...,...,...
16,5.4,3.9,1.3,0.4,0
14,5.8,4.0,1.2,0.2,0
32,5.2,4.1,1.5,0.1,0
33,5.5,4.2,1.4,0.2,0


In [None]:
# selecting particular rows

df[20:23]

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
20,5.4,3.4,1.7,0.2,0
21,5.1,3.7,1.5,0.4,0
22,4.6,3.6,1.0,0.2,0


In [None]:
y

0      0
1      0
2      0
3      0
4      0
      ..
145    2
146    2
147    2
148    2
149    2
Name: target, Length: 150, dtype: int64

In [None]:
X

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm)
0,5.1,3.5,1.4,0.2
1,4.9,3.0,1.4,0.2
2,4.7,3.2,1.3,0.2
3,4.6,3.1,1.5,0.2
4,5.0,3.6,1.4,0.2
...,...,...,...,...
145,6.7,3.0,5.2,2.3
146,6.3,2.5,5.0,1.9
147,6.5,3.0,5.2,2.0
148,6.2,3.4,5.4,2.3


##### STATS OF DATA

In [None]:
df['sepal width (cm)'].min()

2.0

In [None]:
df['sepal width (cm)'].std()

0.4358662849366982

In [None]:
df['sepal width (cm)'].mean()

3.0573333333333337

In [None]:
df['sepal width (cm)'].var()

0.189979418344519

In [None]:
df['sepal width (cm)'].mode()

0    3.0
Name: sepal width (cm), dtype: float64

In [None]:
df['sepal width (cm)'].median()

3.0

In [None]:
df['sepal width (cm)'].max()

4.4

##### SEARCHING

In [None]:
df[df['sepal width (cm)'] == 4.4]

Unnamed: 0,sepal length (cm),sepal width (cm),petal length (cm),petal width (cm),target
15,5.7,4.4,1.5,0.4,0


In [None]:
df[['sepal width (cm)', 'sepal length (cm)']][df['sepal width (cm)'] == 4.4][df['sepal length (cm)'] == 5.7]

  df[['sepal width (cm)', 'sepal length (cm)']][df['sepal width (cm)'] == 4.4][df['sepal length (cm)'] == 5.7]


Unnamed: 0,sepal width (cm),sepal length (cm)
15,4.4,5.7


##### SET AND RESET INDEX

In [None]:
df1 = df.set_index('sepal width (cm)')

In [None]:
df1.head()

Unnamed: 0_level_0,sepal length (cm),petal length (cm),petal width (cm),target
sepal width (cm),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3.5,5.1,1.4,0.2,0
3.0,4.9,1.4,0.2,0
3.2,4.7,1.3,0.2,0
3.1,4.6,1.5,0.2,0
3.6,5.0,1.4,0.2,0


In [None]:
df1.loc[3.5]

Unnamed: 0_level_0,sepal length (cm),petal length (cm),petal width (cm),target
sepal width (cm),Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
3.5,5.1,1.4,0.2,0
3.5,5.1,1.4,0.3,0
3.5,5.2,1.5,0.2,0
3.5,5.5,1.3,0.2,0
3.5,5.0,1.3,0.3,0
3.5,5.0,1.6,0.6,0


In [None]:
df1.reset_index()

Unnamed: 0,sepal width (cm),sepal length (cm),petal length (cm),petal width (cm),target
0,3.5,5.1,1.4,0.2,0
1,3.0,4.9,1.4,0.2,0
2,3.2,4.7,1.3,0.2,0
3,3.1,4.6,1.5,0.2,0
4,3.6,5.0,1.4,0.2,0
...,...,...,...,...,...
145,3.0,6.7,5.2,2.3,2
146,2.5,6.3,5.0,1.9,2
147,3.0,6.5,5.2,2.0,2
148,3.4,6.2,5.4,2.3,2


##### REPLACING NULL VALUES

In [None]:
data2 = pd.read_csv('dataset.csv')

In [None]:
data2.isnull().sum()

id                  0
gender              0
area              384
qualification       0
income            401
marital_status      0
vintage             0
claim_amount      360
num_policies        0
policy            372
type_of_policy      0
cltv                0
dtype: int64

In [None]:
#if null values are not in the default format i.e np.NaN, and we know what kind of null values are there, we can do the following

na_dict = {}

data2 = pd.read_csv('dataset.csv', na_values = na_dict)

In [None]:
# to fill missing values with some data

data2.fillna(method = 'ffill')  # method = ['bfill', 'ffill']

In [None]:
# fill NaN values using interpolation method

data2['income'].interpolate()

In [None]:
# removing missing values

data2.dropna()

In [None]:
# if there is sample having all features with NaN values, we can drop it by following --> it will drop rows

data2.dropna(how = 'all')

In [None]:
# at least 2 values should be there in a row to NOT DROPOUT

data2. dropna(thresh = 2)

##### GROUP BY

In [None]:
import seaborn as sns

In [None]:
newdata = sns.load_dataset('titanic')

In [None]:
newdata.head()

Unnamed: 0,survived,pclass,sex,age,sibsp,parch,fare,embarked,class,who,adult_male,deck,embark_town,alive,alone
0,0,3,male,22.0,1,0,7.25,S,Third,man,True,,Southampton,no,False
1,1,1,female,38.0,1,0,71.2833,C,First,woman,False,C,Cherbourg,yes,False
2,1,3,female,26.0,0,0,7.925,S,Third,woman,False,,Southampton,yes,True
3,1,1,female,35.0,1,0,53.1,S,First,woman,False,C,Southampton,yes,False
4,0,3,male,35.0,0,0,8.05,S,Third,man,True,,Southampton,no,True


In [None]:
d = newdata[['survived', 'class', 'sex', 'fare', 'embark_town']]

In [None]:
d

Unnamed: 0,survived,class,sex,fare,embark_town
0,0,Third,male,7.2500,Southampton
1,1,First,female,71.2833,Cherbourg
2,1,Third,female,7.9250,Southampton
3,1,First,female,53.1000,Southampton
4,0,Third,male,8.0500,Southampton
...,...,...,...,...,...
886,0,Second,male,13.0000,Southampton
887,1,First,female,30.0000,Southampton
888,0,Third,female,23.4500,Southampton
889,1,First,male,30.0000,Cherbourg


In [None]:
d['class'].unique()

['Third', 'First', 'Second']
Categories (3, object): ['First', 'Second', 'Third']

In [None]:
grp = d.groupby('class')

In [None]:
grp

<pandas.core.groupby.generic.DataFrameGroupBy object at 0x787402366380>

In [None]:
for Class, class_df in grp:
  print(Class)
  print(class_df)

First
     survived  class     sex     fare  embark_town
1           1  First  female  71.2833    Cherbourg
3           1  First  female  53.1000  Southampton
6           0  First    male  51.8625  Southampton
11          1  First  female  26.5500  Southampton
23          1  First    male  35.5000  Southampton
..        ...    ...     ...      ...          ...
871         1  First  female  52.5542  Southampton
872         0  First    male   5.0000  Southampton
879         1  First  female  83.1583    Cherbourg
887         1  First  female  30.0000  Southampton
889         1  First    male  30.0000    Cherbourg

[216 rows x 5 columns]
Second
     survived   class     sex     fare  embark_town
9           1  Second  female  30.0708    Cherbourg
15          1  Second  female  16.0000  Southampton
17          1  Second    male  13.0000  Southampton
20          0  Second    male  26.0000  Southampton
21          1  Second    male  13.0000  Southampton
..        ...     ...     ...      ... 

In [None]:
dFirst = grp.get_group('First')
dFirst

Unnamed: 0,survived,class,sex,fare,embark_town
1,1,First,female,71.2833,Cherbourg
3,1,First,female,53.1000,Southampton
6,0,First,male,51.8625,Southampton
11,1,First,female,26.5500,Southampton
23,1,First,male,35.5000,Southampton
...,...,...,...,...,...
871,1,First,female,52.5542,Southampton
872,0,First,male,5.0000,Southampton
879,1,First,female,83.1583,Cherbourg
887,1,First,female,30.0000,Southampton


In [None]:
grp['fare'].max()

class
First     512.3292
Second     73.5000
Third      69.5500
Name: fare, dtype: float64

##### CONCATENATION

##### PIVOT TABLE