In [29]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.model_selection import train_test_split

# Pipeline
from sklearn.pipeline import Pipeline
from sklearn.pipeline import make_pipeline
from sklearn.compose import ColumnTransformer

# Preprocessing
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import OneHotEncoder

# Models
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor

# Evaluation
from sklearn.metrics import classification_report
from sklearn.metrics import plot_confusion_matrix
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_absolute_error
from sklearn.metrics import mean_squared_error


# Bank dataset - RFC

In [None]:
df = pd.read_csv('bank-full.csv',sep=';')

In [None]:
df.head()

In [None]:
df.describe()

In [None]:
df.isnull().sum()

In [None]:
df.info()

### Perform EDA on bank dataset

In [None]:
#correlation
corr = df.corr()
corr.style.background_gradient(cmap='coolwarm')

In [None]:
#Histogram chart
plt.figure(figsize=(15,8))
sns.histplot(data=df, x='y')
plt.title("clients subscribed a term deposit")
plt.ylabel("num of clients")
plt.xlabel("bank term deposit")

In [None]:
#pairplot
sns.pairplot(df, hue='housing')

### Random Forest Classifer 

In [None]:
## ML study being conducted --> Classification 

target = 'y'
# feature set --> it cannot have the target 
X = df.drop(target, axis=1)
# target set 
y = df[target]

X_train, X_test, y_train, y_test = train_test_split(X,y,train_size=0.8, random_state=20)

In [None]:
# return only numeric columns names

numeric_features = X_train.describe().columns 

numeric_features

In [None]:
# Return only categorical names

categorical_features = X_train.describe(exclude="number").columns

categorical_features

In [None]:
# Create a transformer for numeric columns

numeric_transformer = Pipeline(
    steps=[
        # missing values --> by default mean #we do not have missing values in this case but we will keep this line
        ('imputer', SimpleImputer()),
        ('scaler', StandardScaler())
    ]
)

# Create Transformer for categorical data

categorical_transformer = Pipeline(
    steps=[
        # most_frequent --> mode
        ('cat_imputer', SimpleImputer(strategy="most_frequent")),
        ('one_hot', OneHotEncoder(handle_unknown='ignore')) # Ignore unseen categorical in transform step not seen in fit_transform
    ]
)

# Create a preprocessor transformer
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numeric_transformer, numeric_features),
        ('cat', categorical_transformer, categorical_features)
    ]
)

# Append classifier to preprocessing pipeline.
# Now we have a full prediction pipeline.
clf = Pipeline(
    steps=[
        ('preprocessor', preprocessor),
        ('classifier', RandomForestClassifier(n_estimators = 20, criterion = 'entropy', random_state = 0))
    ]
)

clf.fit(X_train, y_train)
print(f"basline : {df['y'].value_counts(normalize=True)}")
print(f"Train score : {clf.score(X_train, y_train)}")
print(f"Test score : {clf.score(X_test, y_test)}")

In [None]:
#simple confusion matrix
plot_confusion_matrix(clf, X_test, y_test);

In [None]:
# Printing the classification report
y_pred = clf.predict(X_test)
print(classification_report(y_test, y_pred))

# Pullotion dataset - RFR

In [2]:
df1= pd.read_csv('pollution_us_2000_2016.csv')

In [3]:
df1.head()

Unnamed: 0.1,Unnamed: 0,State Code,County Code,Site Num,Address,State,County,City,Date Local,NO2 Units,...,SO2 Units,SO2 Mean,SO2 1st Max Value,SO2 1st Max Hour,SO2 AQI,CO Units,CO Mean,CO 1st Max Value,CO 1st Max Hour,CO AQI
0,0,4,13,3002,1645 E ROOSEVELT ST-CENTRAL PHOENIX STN,Arizona,Maricopa,Phoenix,2000-01-01,Parts per billion,...,Parts per billion,3.0,9.0,21,13.0,Parts per million,1.145833,4.2,21,
1,1,4,13,3002,1645 E ROOSEVELT ST-CENTRAL PHOENIX STN,Arizona,Maricopa,Phoenix,2000-01-01,Parts per billion,...,Parts per billion,3.0,9.0,21,13.0,Parts per million,0.878947,2.2,23,25.0
2,2,4,13,3002,1645 E ROOSEVELT ST-CENTRAL PHOENIX STN,Arizona,Maricopa,Phoenix,2000-01-01,Parts per billion,...,Parts per billion,2.975,6.6,23,,Parts per million,1.145833,4.2,21,
3,3,4,13,3002,1645 E ROOSEVELT ST-CENTRAL PHOENIX STN,Arizona,Maricopa,Phoenix,2000-01-01,Parts per billion,...,Parts per billion,2.975,6.6,23,,Parts per million,0.878947,2.2,23,25.0
4,4,4,13,3002,1645 E ROOSEVELT ST-CENTRAL PHOENIX STN,Arizona,Maricopa,Phoenix,2000-01-02,Parts per billion,...,Parts per billion,1.958333,3.0,22,4.0,Parts per million,0.85,1.6,23,


In [4]:
#there are null values
df1.isnull().sum()

Unnamed: 0                0
State Code                0
County Code               0
Site Num                  0
Address                   0
State                     0
County                    0
City                      0
Date Local                0
NO2 Units                 0
NO2 Mean                  0
NO2 1st Max Value         0
NO2 1st Max Hour          0
NO2 AQI                   0
O3 Units                  0
O3 Mean                   0
O3 1st Max Value          0
O3 1st Max Hour           0
O3 AQI                    0
SO2 Units                 0
SO2 Mean                  0
SO2 1st Max Value         0
SO2 1st Max Hour          0
SO2 AQI              872907
CO Units                  0
CO Mean                   0
CO 1st Max Value          0
CO 1st Max Hour           0
CO AQI               873323
dtype: int64

### Perform EDA on bank dataset

In [5]:
#correlation
#strog correlation -->CO AQI ( CO Mean, CO 1st Max Value,NO2 Mean,NO2 1st Max Value,NO2 AQI )
corr = df1.corr()
corr.style.background_gradient(cmap='coolwarm')

Unnamed: 0.1,Unnamed: 0,State Code,County Code,Site Num,NO2 Mean,NO2 1st Max Value,NO2 1st Max Hour,NO2 AQI,O3 Mean,O3 1st Max Value,O3 1st Max Hour,O3 AQI,SO2 Mean,SO2 1st Max Value,SO2 1st Max Hour,SO2 AQI,CO Mean,CO 1st Max Value,CO 1st Max Hour,CO AQI
Unnamed: 0,1.0,0.900468,0.314373,-0.250897,-0.178741,-0.15794,-0.012439,-0.159556,0.061252,0.016373,8.1e-05,0.01708,0.082965,0.101342,0.066659,0.10471,-0.275512,-0.239109,-0.016264,-0.258143
State Code,0.900468,1.0,0.269284,-0.245268,-0.095675,-0.069766,0.017757,-0.070765,0.04311,0.036221,0.000902,0.033527,0.175411,0.176024,0.088107,0.183895,-0.220182,-0.170506,0.001214,-0.189588
County Code,0.314373,0.269284,1.0,-0.143841,-0.092672,-0.071285,-0.001948,-0.072793,0.054692,0.048233,-0.007364,0.040689,-0.008233,0.029848,0.00511,0.036789,-0.097669,-0.084075,-0.010644,-0.089075
Site Num,-0.250897,-0.245268,-0.143841,1.0,0.070851,0.072546,-0.014032,0.073608,0.043902,0.090318,0.00287,0.083351,-0.081302,-0.0754,-0.0541,-0.07601,0.057952,0.065934,-0.018481,0.066155
NO2 Mean,-0.178741,-0.095675,-0.092672,0.070851,1.0,0.904383,0.11254,0.905418,-0.43265,-0.151998,0.027303,-0.082361,0.348186,0.281954,0.117438,0.295347,0.641828,0.63902,0.186196,0.661031
NO2 1st Max Value,-0.15794,-0.069766,-0.071285,0.072546,0.904383,1.0,0.147918,0.997859,-0.291832,0.009866,0.022406,0.048743,0.301599,0.262012,0.13104,0.277928,0.563316,0.60658,0.23143,0.61155
NO2 1st Max Hour,-0.012439,0.017757,-0.001948,-0.014032,0.11254,0.147918,1.0,0.147136,-0.203151,-0.090507,-0.125653,-0.086782,0.027148,0.021742,0.173731,0.021074,0.060253,0.079613,0.381236,0.057663
NO2 AQI,-0.159556,-0.070765,-0.072793,0.073608,0.905418,0.997859,0.147136,1.0,-0.29172,0.01032,0.022352,0.049027,0.305201,0.26507,0.129903,0.281377,0.566264,0.609375,0.230857,0.614445
O3 Mean,0.061252,0.04311,0.054692,0.043902,-0.43265,-0.291832,-0.203151,-0.29172,1.0,0.860622,0.045752,0.768771,-0.110401,-0.07247,-0.083162,-0.070859,-0.339426,-0.343399,-0.19927,-0.355314
O3 1st Max Value,0.016373,0.036221,0.048233,0.090318,-0.151998,0.009866,-0.090507,0.01032,0.860622,1.0,0.055518,0.931867,-0.016075,0.025376,-0.015759,0.034514,-0.182691,-0.164323,-0.097012,-0.179301


### Random Forest Regressor 

In [30]:
## ML study being conducted --> Classification 
var=['CO Mean', 'CO 1st Max Value','NO2 Mean','NO2 1st Max Value','NO2 AQI']
target = 'CO AQI'
# feature set --> it cannot have the target 
X = df1[var]
# target set 
y = df1[target]

X_train, X_test, y_train, y_test = train_test_split(X,y,train_size=0.8, random_state=20)

In [31]:
# return only numeric columns names

numeric_features = X_train.describe().columns 

numeric_features

Index(['CO Mean', 'CO 1st Max Value', 'NO2 Mean', 'NO2 1st Max Value',
       'NO2 AQI'],
      dtype='object')

In [32]:
print(f"basline : {df1['CO AQI'].value_counts(normalize=True)}")

basline : 5.996595      0.499996
3.000000      0.083547
2.000000      0.081603
5.000000      0.065727
6.000000      0.050057
                ...   
113.000000    0.000001
101.000000    0.000001
150.000000    0.000001
159.000000    0.000001
98.000000     0.000001
Name: CO AQI, Length: 108, dtype: float64


In [33]:
#solve missing value 
df1['CO AQI'].fillna(np.mean(df1['CO AQI']),inplace = True)
df1['SO2 AQI'].fillna(np.mean(df1['SO2 AQI']),inplace = True)

In [34]:
df1.isnull().sum()

Unnamed: 0           0
State Code           0
County Code          0
Site Num             0
Address              0
State                0
County               0
City                 0
Date Local           0
NO2 Units            0
NO2 Mean             0
NO2 1st Max Value    0
NO2 1st Max Hour     0
NO2 AQI              0
O3 Units             0
O3 Mean              0
O3 1st Max Value     0
O3 1st Max Hour      0
O3 AQI               0
SO2 Units            0
SO2 Mean             0
SO2 1st Max Value    0
SO2 1st Max Hour     0
SO2 AQI              0
CO Units             0
CO Mean              0
CO 1st Max Value     0
CO 1st Max Hour      0
CO AQI               0
dtype: int64

In [37]:
reg_forest = RandomForestRegressor(n_estimators = 20, random_state = 0, criterion = 'mse')

reg_forest.fit(X_train, y_train)

preds_forest = reg_forest.predict(X_test)


mean_absolute_error(y_true=y_test, y_pred=preds_forest)


0.6738316834002206

In [39]:
mean_squared_error(y_true=y_test, y_pred=preds_forest)

2.8125706186307067

In [41]:
print(f"Train score : {reg_forest.score(X_train, y_train)}")
print(f"Test score : {reg_forest.score(X_test, y_test)}")

Train score : 0.9547226503754325
Test score : 0.8335440168698215


In [43]:
reg_forest.feature_importances_

array([0.5140949 , 0.23185621, 0.15625399, 0.05983163, 0.03796328])