## Libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

from sklearn.preprocessing import StandardScaler, OneHotEncoder, OrdinalEncoder, MinMaxScaler, LabelEncoder

from statsmodels.stats.outliers_influence import variance_inflation_factor

import statsmodels.api as sm
from imblearn.over_sampling import SMOTE
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier, plot_tree
from sklearn.svm import SVC
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, f1_score, recall_score, precision_score, mean_squared_error


# Pre-processing

In [None]:
# Load the dataset into `df`.
# NOTE(review): both reads assign to `df`, so the Excel load replaces the CSV
# result — presumably these are alternatives; keep only the one that applies.
df=pd.read_csv('/content/file.csv')
df= pd.read_excel('/content/file.xlsx')

In [None]:
# Dimensions of the frame as (rows, columns).
df.shape

In [None]:
# Preview the first rows of the frame.
df.head()

In [None]:
# Number of rows that contain at least one missing value.
row_has_missing = df.isnull().any(axis=1)
row_has_missing.sum()

In [None]:
# Per-column count of missing values, restricted to columns that have any.
column_has_missing = df.isnull().any()
nan_columns = df.columns[column_has_missing].tolist()
nan_counts = {}
for column in nan_columns:
    nan_counts[column] = df[column].isnull().sum()
print(nan_counts)

In [None]:
# Remove, in place, every row that has at least one missing value.
df.dropna(inplace=True)

In [None]:
# Replace remaining missing values with 0, in place.
# NOTE(review): if the dropna cell has already run there is nothing left to
# fill — presumably this is an alternative strategy; confirm which one to use.
df.fillna(0, inplace=True)

In [None]:
# Impute missing entries of 'variable' with the column mean.
# The mean is taken from `df` itself (the original read it from `bikes_df`,
# a name that is not defined in this notebook) and is computed once, since
# filling gaps with the mean does not change the mean.
variable_mean = df['variable'].mean()
df['variable'] = df['variable'].fillna(variable_mean)

In [None]:
# Number of rows that are exact duplicates of an earlier row.
df.duplicated().sum()

In [None]:
# Remove duplicate rows in place.
df.drop_duplicates(inplace=True)

In [None]:
# Print the frame's column summary (dtypes, non-null counts, memory usage).
df.info()

In [None]:
# Convert the 'date' column to pandas datetime values.
df['date'] = pd.to_datetime(df['date'])

In [None]:
# Cast 'variable' to an integer dtype.
df['variable'] = df['variable'].astype(int)

In [None]:
# Write the cleaned frame to disk; index=False omits the row index column.
df.to_csv('cleaned_dataset.csv', index=False)

In [None]:
# Keep an independent snapshot of the frame before further changes.
df_copy = df.copy(deep=True)

# Exploration

In [None]:
# Frequency of each distinct value in the 'y' column.
df['y'].value_counts()

In [None]:
# Summary statistics with pandas' default column selection.
df.describe()

In [None]:
# Summary statistics restricted to object-dtype (categorical/text) columns.
df.describe(include=['O'])  # 'O' selects object dtype

In [None]:
# Skewness of each numeric column.
# numeric_only=True keeps this working when the frame still holds text or
# datetime columns; recent pandas versions raise on those instead of
# silently skipping them.
df.skew(numeric_only=True)

In [None]:
# Kurtosis of each numeric column.
# numeric_only=True keeps this working when the frame still holds text or
# datetime columns; recent pandas versions raise on those instead of
# silently skipping them.
df.kurtosis(numeric_only=True)

In [None]:
# Distribution of the target column.
# Reads from `df`; the original referenced `bank_df`, which is not defined
# in this notebook.
sns.histplot(df['y'])

In [None]:
# Box plot of columns 'x1' and 'x2'; the returned plot object is kept in `boxplot`.
boxplot = df.boxplot(column=['x1', 'x2'])

In [None]:
# Scatter plot of column 'y' against column 'x'.
df.plot.scatter(x='x',
                      y='y')

In [None]:
# Absolute correlation of every numeric/boolean column with 'y', strongest first.
# Columns are narrowed explicitly because DataFrame.corr() raises on text
# columns in recent pandas versions.
df.select_dtypes(include=['number', 'bool']).corr()['y'].abs().sort_values(ascending = False)

In [None]:
# Heatmap of pairwise correlations between numeric/boolean columns.
# Columns are narrowed explicitly because DataFrame.corr() raises on text
# columns in recent pandas versions.
plt.figure(figsize=(20,8))
sns.heatmap(df.select_dtypes(include=['number', 'bool']).corr())

In [None]:
# Pearson correlation matrix over numeric/boolean columns. Columns are
# narrowed explicitly because DataFrame.corr() raises on text columns in
# recent pandas versions.
correlation_matrix = df.select_dtypes(include=['number', 'bool']).corr(method='pearson')

# Keep only medium-to-strong correlations (|r| >= threshold); weaker cells
# become NaN so they show up blank in a heatmap.
min_corr_threshold = 0.3
filtered_correlation_matrix = correlation_matrix[
    (correlation_matrix >= min_corr_threshold) | (correlation_matrix <= -min_corr_threshold)
]
correlation_matrix

In [None]:
# Annotated heatmap of the thresholded correlation matrix.
fig, ax = plt.subplots(figsize=(20, 8))
sns.heatmap(
    filtered_correlation_matrix,
    annot=True,
    cmap='coolwarm',
    linewidths=.5,
    ax=ax,
)
ax.set_title('Medium to Strong Correlation Matrix - 0.3+')
plt.show()

In [None]:
# Stacked bar chart: for each value of 'variable', how many rows fall in each 'y' class.
pair_sizes = df.groupby(['variable', 'y']).size()
x_y_count = pair_sizes.unstack(fill_value=0)

ax = x_y_count.plot(kind='bar', stacked=True)
ax.set_title('Y by X')
ax.set_xlabel('X')
ax.set_ylabel('Y Count')
plt.xticks(rotation=45)
ax.legend(title='Y')
plt.tight_layout()
plt.show()

In [None]:
# Scatter of 'variable_2' against 'variable_1', with points coloured by the
# 'y' column using the viridis colormap.
df.plot.scatter(x='variable_1',
                      y='variable_2',
                      c='y',
                      colormap='viridis')

# Data Transformation

In [None]:
# Split the frame by dtype and record the column names of each group.
numerical_data = df.select_dtypes(include=['float64','int64'])
object_data = df.select_dtypes(include=[object])
ind_num = numerical_data.columns.to_list()
ind_obj = object_data.columns.to_list()

# Columns whose name starts with "is", leaving out "is_claim".
is_cols_df = []
for col in df.columns:
    if col.startswith("is") and col != "is_claim":
        is_cols_df.append(col)

In [None]:
# Encode dichotomous columns: 'yes' -> 1, 'no' -> 0.
# A column is selected when it contains at least one 'yes' or 'no' value.
# The selection reads from `df`; the original used `bank_df`, which is not
# defined in this notebook.
yes_no_columns = df.columns[df.isin(['yes', 'no']).any()]
df[yes_no_columns] = df[yes_no_columns].replace({'yes': 1, 'no': 0})

In [None]:
# Encode categorical columns.

# Ordinal variable: integer codes from LabelEncoder.
# NOTE(review): confirm the codes LabelEncoder assigns match the intended
# category order.
label_encoder = LabelEncoder()
df['variable_encoded'] = label_encoder.fit_transform(df['variable'])

# Nominal variables: swap the raw columns for dummy columns
# (drop_first=True omits one level per variable).
nominal_columns = ['x1','x2','x3','x4','x5','x6']
cat_df = df[nominal_columns]
dummy_df = pd.get_dummies(cat_df, drop_first=True)
df = pd.concat([df.drop(columns=cat_df.columns), dummy_df], axis=1)

In [None]:
# Rescale every column of df.
# index=df.index keeps the scaled frames row-aligned with df: after the
# dropna / drop_duplicates cells the index is no longer 0..n-1, and a fresh
# default index would misalign any later join or concat with df.

# Standardisation (zero mean, unit variance) — for roughly normal data.
scaler = StandardScaler()
df_standardized = pd.DataFrame(scaler.fit_transform(df), columns=df.columns, index=df.index)

# Min-max scaling to [0, 1] — for data that is not normally distributed.
# NOTE: `scaler` is rebound here, so the StandardScaler instance is no longer reachable.
scaler = MinMaxScaler()
df_scaled = pd.DataFrame(scaler.fit_transform(df), columns=df.columns, index=df.index)

In [None]:
# Combine columns: row-wise sum of x1, x2 and x3 stored as a new column.
df['new_column'] = df[['x1','x2','x3']].sum(axis=1)  # intended for numeric columns, typically 0/1 flags

In [None]:
# Remove outliers of 'variable' with the 1.5 * IQR rule.
column = df['variable']
Q1 = column.quantile(0.25)
Q3 = column.quantile(0.75)
IQR = Q3 - Q1

lower_bound = Q1 - 1.5 * IQR
upper_bound = Q3 + 1.5 * IQR

# Keep rows inside [lower_bound, upper_bound], both ends inclusive.
inside_fences = column.between(lower_bound, upper_bound)
df = df[inside_fences]
df.shape

In [None]:
# De-normalise a min-max scaled column back to its original range and round
# up to whole numbers.
# Changes from the original cell: it works on `df` throughout (`train` is
# not defined in this notebook), uses np.ceil because `math` is never
# imported, and no longer rebinds the builtins `max` / `min`.
original_max = 70
original_min = 18
rescaled = df['variable'] * (original_max - original_min) + original_min
df['dNorm_variable'] = np.ceil(rescaled).astype(int)
df['dNorm_variable'].describe()

# Feature and Model Selection

In [None]:
# Variance inflation factor for every column of df, rounded and listed from
# highest to lowest.
design_matrix = df.values
vif_values = [
    variance_inflation_factor(design_matrix, position)
    for position in range(df.shape[1])
]
vif_data = pd.DataFrame({"feature": df.columns, "VIF": vif_values})
vif_data['VIF'] = vif_data['VIF'].round()
vif_data = vif_data.sort_values(by='VIF', ascending=False)
vif_data

In [None]:
# Histogram of the VIF values with the top and right spines hidden.
vif_axes = vif_data['VIF'].plot(kind='hist', bins=20, title='VIF')
vif_axes.spines[['top', 'right']].set_visible(False)

In [None]:
# Drop predictors whose rounded VIF is 5 or more.
# The VIF table covers every column of df, so the target 'y' can be flagged
# too; it is excluded here because the train/test split needs df['y'].
flagged = vif_data.loc[vif_data['VIF'] >= 5, 'feature']
high_vif_columns = [feature for feature in flagged if feature != 'y']
df = df.drop(columns=high_vif_columns)
high_vif_columns

#### Sampling Data for Training & Testing

In [None]:
# Separate predictors from the target, then hold out 20% for testing.
# stratify=y keeps the class proportions the same in both partitions.
y = df['y']
X = df.drop(columns='y')
X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    test_size=0.2,
    stratify=y,
    random_state=333,
)

In [None]:
# Class frequencies in the training target.
y_train.value_counts()

In [None]:
# Class frequencies in the test target.
y_test.value_counts()

In [None]:
# Oversample the training partition with SMOTE; the test partition is left
# untouched.
# random_state is fixed (same seed as the split) so the resampled set — and
# every metric computed downstream — is reproducible between runs.
smote = SMOTE(random_state=333)
X_train, y_train = smote.fit_resample(X_train, y_train)

In [None]:
# Fit a binomial GLM on the training data with an added intercept column,
# then show its summary.
X_train_with_const = sm.add_constant(X_train)
binomial_glm = sm.GLM(y_train, X_train_with_const, family=sm.families.Binomial())
simple_reg = binomial_glm.fit()
simple_reg.summary()

In [None]:
# Score the test set with the fitted GLM and turn probabilities into 0/1
# labels at the chosen cut-off.
threshold = 0.5

X_test_with_const = sm.add_constant(X_test)
probab_test = simple_reg.predict(X_test_with_const)
pred_test = np.where(probab_test >= threshold, 1, 0)
# NOTE(review): perf_metrics is not defined in the visible part of this
# notebook — confirm where it comes from.
perf_metrics(y_test, pred_test)

In [None]:
# Collect predictors whose |z| in the fitted GLM is below 2 (roughly: not
# significant at the 5% level).
# The z statistics come straight from the fitted result `simple_reg`. The
# original read them from `lr_results`, which is not defined in this
# notebook, and parsed them out of the printed summary table.
z_scores = simple_reg.tvalues
weak_terms = z_scores[z_scores.abs() < 2].index

# The intercept added by sm.add_constant is named 'const'; it is not a
# column of X_train / X_test, so it must not appear in the list of columns
# to drop.
variable_names = [name for name in weak_terms if name != 'const']
variable_names

In [None]:
# Remove the insignificant predictors from both partitions.
X_tn, X_ts = (
    partition.drop(columns=variable_names)
    for partition in (X_train, X_test)
)
X_tn.shape

#### Models

In [None]:
# Logistic regression: fit on the reduced training set, evaluate on the
# reduced test set.
lr = LogisticRegression()
lr.fit(X_tn, y_train)
y_pred = lr.predict(X_ts)
mse_lr = mean_squared_error(y_test, y_pred)
print('rmse:',np.sqrt(mse_lr))
print("Accuracy is: ", accuracy_score(y_test,y_pred)*100)
# score() on a classifier returns mean accuracy, and it is evaluated on the
# training data here — the previous 'r2-Score' label was wrong.
print('Train accuracy:',lr.score(X_tn, y_train))
print(classification_report(y_test, y_pred))

In [None]:
# Decision tree classifier: fit on the reduced training set, evaluate on the
# reduced test set.
dt = DecisionTreeClassifier(max_depth=50, random_state=42)
dt.fit(X_tn, y_train)
y_pred = dt.predict(X_ts)
mse_dt = mean_squared_error(y_test, y_pred)
print('rmse:',np.sqrt(mse_dt))
print("Accuracy is: ", accuracy_score(y_test,y_pred)*100)
# score() on a classifier returns mean accuracy, and it is evaluated on the
# training data here — the previous 'r2-Score' label was wrong.
print('Train accuracy:',dt.score(X_tn, y_train))
print(classification_report(y_test, y_pred))

In [None]:
# Draw the top three levels of the fitted decision tree.
# feature_names is passed as a plain list: some scikit-learn releases
# validate this argument strictly and reject a pandas Index.
plt.figure(figsize=(12,12))
plot_tree(dt, max_depth=3, fontsize=10, feature_names=list(X_tn.columns))
plt.show()

In [None]:
# Rank the predictors by the importance the decision tree assigned to them.
feature_importances = dt.feature_importances_
importance_table = pd.DataFrame(
    {'Feature': X_tn.columns, 'Importance': feature_importances}
)
feature_importance_df = importance_table.sort_values(by='Importance', ascending=False)
feature_importance_df

In [None]:
# Random forest: fit on the reduced training set, evaluate on the reduced
# test set.
rf = RandomForestClassifier(n_estimators=70, random_state=42)
rf.fit(X_tn, y_train)
y_pred = rf.predict(X_ts)
mse_rf = mean_squared_error(y_test, y_pred)
print('rmse:',np.sqrt(mse_rf))
print("Accuracy is ", accuracy_score(y_test,y_pred)*100)
# Mean accuracy on the training data.
print('Score:', rf.score(X_tn, y_train))
# Report against y_test; the original passed `y_tests`, an undefined name.
print(classification_report(y_test, y_pred))

In [None]:
# Support vector machine: fit on the reduced training set, evaluate on the
# reduced test set.
svm = SVC()
svm.fit(X_tn, y_train)
y_pred = svm.predict(X_ts)
mse_svm = mean_squared_error(y_test, y_pred)
print('rmse:', np.sqrt(mse_svm))
accuracy = accuracy_score(y_test, y_pred)
print("Accuracy is: ", accuracy * 100)
# score() on a classifier returns mean accuracy, and it is evaluated on the
# training data here — the previous 'r2-Score' label and `r2_score`
# variable name were wrong.
train_accuracy = svm.score(X_tn, y_train)
print('Train accuracy:', train_accuracy)
print(classification_report(y_test, y_pred))

In [None]:
# Multi-layer perceptron with three hidden layers of 2, 3 and 4 units:
# fit on the reduced training set, evaluate on the reduced test set.
mlp = MLPClassifier(
    hidden_layer_sizes=(2,3,4),
    max_iter=50,
    random_state=42,
)
mlp.fit(X_tn, y_train)
y_pred = mlp.predict(X_ts)

mse_nn = mean_squared_error(y_test, y_pred)
rmse_nn = np.sqrt(mse_nn)
print('rmse:', rmse_nn)

test_accuracy_pct = accuracy_score(y_test, y_pred) * 100
print("Accuracy is ", test_accuracy_pct)

# Score on the training data.
print('Score:', mlp.score(X_tn, y_train))
print(classification_report(y_test, y_pred))

In [None]:
# Gradient boosting: fit on the reduced training set, evaluate on the
# reduced test set.
gb = GradientBoostingClassifier(n_estimators=90, random_state=42)
gb.fit(X_tn, y_train)
y_pred = gb.predict(X_ts)
mse_gb = mean_squared_error(y_test, y_pred)
print('rmse:',np.sqrt(mse_gb))
print("Accuracy is: ", accuracy_score(y_test,y_pred)*100)
# score() on a classifier returns mean accuracy, and it is evaluated on the
# training data here — the previous 'r2-Score' label was wrong.
print('Train accuracy:',gb.score(X_tn, y_train))
print(classification_report(y_test, y_pred))