# Data Science in Finance and Insurance - Project

In [1]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt;
from sklearn.preprocessing import StandardScaler
from sklearn.decomposition import PCA

# Data Preprocessing

In [2]:
# Load the raw dataset and discard ' Net Income Flag' right away: its variance
# is zero (Var[X_{Net Income Flag}] = 0), so it carries no information.
bdata = pd.read_csv("group_project_data.csv").drop(columns=[' Net Income Flag'])

# Pairwise correlation between all remaining columns.
corrMat = bdata.corr()

# Draw the correlation matrix as a heat map with an attached colour scale.
fig, ax = plt.subplots(nrows=1, ncols=1, figsize=(20, 20))
cax = ax.matshow(corrMat, interpolation='nearest', cmap="coolwarm")
fig.colorbar(cax)

# Label every row and column with its variable name; the x labels are rotated
# to vertical so that they do not overlap.
list_cols = bdata.columns.tolist()
tick_positions = list(range(len(list_cols)))
ax.set_xticks(tick_positions, labels=list_cols)
ax.set_yticks(tick_positions, labels=list_cols)
ax.tick_params(axis='x', labelrotation=90)

plt.show()

FileNotFoundError: [Errno 2] No such file or directory: 'group_project_data.csv'

In [None]:
# Find the variables most correlated with Bankruptcy.
# A variable is kept when |corr(variable, 'Bankrupt?')| >= lb_abs_corr.
lb_abs_corr = .1

# Correlation of every column with the target, most negative first.
bankCorr = corrMat['Bankrupt?'].sort_values(ascending=True)

# A boolean comparison already yields a usable mask, so it indexes the Series
# directly (no np.where(..., True, False) wrapper needed).
negbankCorr = bankCorr[bankCorr <= -lb_abs_corr]

# Remove the target's correlation with itself by label instead of relying on it
# being the last entry after sorting; errors='ignore' keeps this safe even if
# the target did not pass the threshold filter.
posbankCorr = bankCorr[bankCorr >= lb_abs_corr].drop('Bankrupt?', errors='ignore')

print("Number of Relevant Variables:", negbankCorr.shape[0] + posbankCorr.shape[0])

In [None]:
# Names of all selected features as one flat array: the negatively correlated
# variables come first, followed by the positively correlated ones.
important_variables = np.concatenate(
    [negbankCorr.index.values, posbankCorr.index.values]
)
important_variables

In [None]:
# Separate the target column (y) from the feature columns (X).
y = bdata['Bankrupt?']
X = bdata.drop(columns=['Bankrupt?'])

## Principal Component Analysis

### PCA - Without Feature Selection

In [None]:
# Center and scale the data so every feature enters the PCA on the same footing.
X_std = StandardScaler().fit_transform(X)

# Fit a PCA that keeps every component, then project the data onto them.
pca = PCA().fit(X_std)
pca_data = pca.transform(X_std)

# explained_var[k - 1] is the share of total variance captured by the first k
# components.
explained_var = np.cumsum(pca.explained_variance_)/np.sum(pca.explained_variance_)
# Component counts start at 1 so that each point sits at the number of
# components it describes, matching the x-axis label (a 0-based axis would
# shift the whole curve one component to the left).
x = np.arange(1, len(explained_var) + 1)
plt.plot(x, explained_var)
plt.xlabel("Number of PCA components")
plt.ylabel("Cumulative Variance Explained")
plt.title("Variance explained by Number of PCA components")
plt.show()

In [None]:
# Share of the total variance that the retained components must explain.
var_ratio = .9
# explained_var[k - 1] is the cumulative share of the first k components, so the
# smallest k that reaches var_ratio is one more than the number of entries that
# are still below it. The result is capped at the number of components available.
num_components = min(int((explained_var < var_ratio).sum()) + 1, len(explained_var))
# Keep the reduced data itself (all rows, first num_components columns) rather
# than only its shape tuple.
new_data = pca_data[:, :num_components]

### PCA - With Feature Selection

In [None]:
# Restrict the features to the correlation-selected variables, then center and
# scale them.
X_sub = X[important_variables]
X_sub_std = StandardScaler().fit_transform(X_sub)

# Fit a PCA that keeps every component, then project the data onto them.
pca_sub = PCA().fit(X_sub_std)
pca_data_sub = pca_sub.transform(X_sub_std)

# explained_var_sub[k - 1] is the share of total variance captured by the first
# k components.
explained_var_sub = np.cumsum(pca_sub.explained_variance_)/np.sum(pca_sub.explained_variance_)
# Component counts start at 1 so that each point sits at the number of
# components it describes, matching the x-axis label (a 0-based axis would
# shift the whole curve one component to the left).
x_sub = np.arange(1, len(explained_var_sub) + 1)
plt.plot(x_sub, explained_var_sub)
plt.xlabel("Number of PCA components")
plt.ylabel("Cumulative Variance Explained")
plt.title("Variance explained by Number of PCA components")
plt.show()

# Algorithm Implementation

## K-Nearest Neighbors

In [None]:
from sklearn.model_selection import train_test_split, cross_val_score, GridSearchCV
from sklearn.neighbors import KNeighborsClassifier
from sklearn.metrics import accuracy_score
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler

In [None]:
# Split the selected, still unscaled features into training and testing sets
# (80% train, 20% test).
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    X_sub, y, test_size=0.2, random_state=42
)

# Fit the scaler on the training rows only and reuse it for the test rows, so
# that no test-set statistics leak into the features the model is trained on.
# (Splitting an array that was standardised on the full dataset would let the
# test rows influence the training features.)
split_scaler = StandardScaler().fit(X_train_raw)
X_train = split_scaler.transform(X_train_raw)
X_test = split_scaler.transform(X_test_raw)


In [None]:
# Define the hyperparameter grid for k
param_grid = {'n_neighbors': list(range(1, 21))}  # You can adjust the range as needed

# A single cross-validated grid search evaluates every candidate k on the same
# 5 folds. refit=False skips the automatic refit on the full training set,
# because the final model is trained explicitly further down.
knn_model = KNeighborsClassifier()
grid_search = GridSearchCV(knn_model, param_grid=param_grid, cv=5, refit=False)
grid_search.fit(X_train, y_train)

# Mean cross-validated accuracy for each k, in the order the grid was searched.
accuracy_dict = {
    params['n_neighbors']: mean_score
    for params, mean_score in zip(
        grid_search.cv_results_['params'],
        grid_search.cv_results_['mean_test_score'],
    )
}

# Find the optimal K with the highest accuracy (the smallest such K on ties,
# since max() returns the first maximal key in insertion order)
optimal_k = max(accuracy_dict, key=accuracy_dict.get)

# Visualize the results
plt.plot(list(accuracy_dict.keys()), list(accuracy_dict.values()), marker='o')
plt.xlabel("Number of Neighbors (K)")
plt.ylabel("Cross-validated Accuracy")
plt.title("Cross-validated Accuracy for Different Number of Neighbors (K)")
plt.show()

print(f"Optimal Number of Neighbors (K): {optimal_k}")
print(f"Cross-validated Accuracy with Optimal K: {accuracy_dict[optimal_k]:.2f}")

# Train the final model with the optimal K
knn_model_final = KNeighborsClassifier(n_neighbors=optimal_k)
knn_model_final.fit(X_train, y_train)

# Evaluate the final model on the held-out test set
y_pred = knn_model_final.predict(X_test)
accuracy_final = accuracy_score(y_test, y_pred)

print(f"Accuracy on the test set with optimal K: {accuracy_final:.2f}")

## Linear (and Quadratic) Discriminant Analysis

## Logistic Regression

## Naive Bayes

## Decision Trees (and Random Forests)