# Imports

In [0]:
#!pip install tensorflow-gpu

In [0]:
import pandas as pd
import numpy as np
from functools import reduce
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.model_selection import RandomizedSearchCV, GridSearchCV
from sklearn.preprocessing import StandardScaler
import xgboost as xgb
from sklearn.model_selection import cross_val_score, KFold
import string
from sklearn.preprocessing import LabelEncoder
from sklearn.svm import SVC
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import chi2
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA
import datetime

In [0]:
import tensorflow as tf
from tensorflow import keras

from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout
from tensorflow.keras.optimizers import RMSprop
from tensorflow.keras.layers import LeakyReLU

from tensorflow.keras.callbacks import EarlyStopping
from tensorflow.keras.layers import BatchNormalization
print(tf.__version__)

In [0]:
# Colab-only: mount Google Drive at /gdrive so the CSV files read below are reachable.
from google.colab import drive
drive.mount("/gdrive")

# Read csv file

In [0]:
# All competition CSVs live in the same Drive folder; build each path from one prefix.
_hackathon_dir = "/gdrive/My Drive/Colab Notebooks/Hackathon/"


def _load_csv(file_name):
    """Read one CSV from the hackathon folder into a DataFrame."""
    return pd.read_csv(_hackathon_dir + file_name)


bond_ratings_df = _load_csv("bond_ratings.csv")
fund_allocations_df = _load_csv("fund_allocations.csv")
fund_config_df = _load_csv("fund_config.csv")
fund_ratios_df = _load_csv("fund_ratios.csv")
fund_specs_df = _load_csv("fund_specs.csv")
other_specs_df = _load_csv("other_specs.csv")
return_3year_df = _load_csv("return_3year.csv")
return_5year_df = _load_csv("return_5year.csv")
return_10year_df = _load_csv("return_10year.csv")
submission_df = _load_csv("sample_submission.csv")

# bond_ratings_df

In [0]:
#bond_ratings_df.info()

In [0]:
#bond_ratings_df.describe().T

#### Drop the "us_govt_bond_rating" column because all of its values are zero.

In [0]:
# Remove the uninformative column in place, then inspect what is left.
bond_ratings_df.drop(columns='us_govt_bond_rating', inplace=True)
bond_ratings_df.info()

In [0]:
#bond_ratings_df.hist(figsize=(15,10), bins=50)

In [0]:
# Impute missing values with the column mean (the median is zero for several
# columns, so it is not used here). The fund_id key is left untouched.
bond_feature_cols = [name for name in bond_ratings_df.columns if name != 'fund_id']
for name in bond_feature_cols:
    series = bond_ratings_df[name]
    bond_ratings_df[name] = series.fillna(series.mean())

#bond_ratings_df.isna().sum()

# fund_allocations_df

In [0]:
#fund_allocations_df.info()

In [0]:
#fund_allocations_df.describe().T

In [0]:
#fund_allocations_df.hist(figsize=(15,10), bins=20)

In [0]:
# Impute missing values with the column median; the fund_id key is left untouched.
allocation_feature_cols = [name for name in fund_allocations_df.columns if name != 'fund_id']
for name in allocation_feature_cols:
    series = fund_allocations_df[name]
    fund_allocations_df[name] = series.fillna(series.median())

#fund_allocations_df.isna().sum()

In [0]:
# Rename 'id' to 'tag', the key name the later merge on ['tag'] expects.
fund_allocations_df.rename(columns={'id':'tag'},inplace=True)

# fund_config_df

In [0]:
#fund_config_df.info()

In [0]:
# Report how many distinct values each categorical column of fund_config_df holds.
for field in ('category', 'parent_company', 'fund_name'):
    print("Number of " + field + ": ", fund_config_df[field].nunique())

In [0]:
fund_config_df.head()

### Apply label encoding: fund_config_df contains many categories, so one-hot encoding would make the final dataframe very sparse; hence label encoding is tried first

In [0]:
# Replace each categorical column with integer codes. The single encoder is refit
# per column, so after this cell it only remembers the classes of 'fund_name'.
labelencoder = LabelEncoder()
for field in ('category', 'parent_company', 'fund_name'):
    fund_config_df[field] = labelencoder.fit_transform(fund_config_df[field])
fund_config_df.head()

In [0]:
#fund_config_df.hist(figsize=(15,10), bins=50)

# fund_ratios_df

In [0]:
#fund_ratios_df.info()

In [0]:
#fund_ratios_df.describe().T

In [0]:
# Text-typed columns (other than the fund_id key) are stripped of commas and cast
# to float64 -- presumably they hold numbers written with thousands separators.
ratio_text_cols = [
    name
    for name in fund_ratios_df.select_dtypes(include='object').columns
    if name != 'fund_id'
]
for name in ratio_text_cols:
    fund_ratios_df[name] = fund_ratios_df[name].str.replace(',', '').astype("float64")

In [0]:
#fund_ratios_df.describe().T

In [0]:
# Impute missing values with the column median; the fund_id key is left untouched.
ratio_feature_cols = [name for name in fund_ratios_df.columns if name != 'fund_id']
for name in ratio_feature_cols:
    series = fund_ratios_df[name]
    fund_ratios_df[name] = series.fillna(series.median())

#fund_ratios_df.isna().sum()

In [0]:
#fund_ratios_df.hist(figsize=(15,10), bins=50)

# fund_specs_df

In [0]:
#fund_specs_df.info()

### Convert "inception_date"

In [0]:
# Parse the column to datetime so the `.dt` accessors used in the next cells work.
fund_specs_df['inception_date'] = pd.to_datetime(fund_specs_df['inception_date'])

### Get month, day, year and day of week from "inception_date" (quarter and semester follow below)

In [0]:
# Derive calendar features from the parsed inception date.
inception_parts = fund_specs_df['inception_date'].dt
fund_specs_df['Inception_Month'] = inception_parts.month
fund_specs_df['Inception_Day'] = inception_parts.day
fund_specs_df['Inception_Year'] = inception_parts.year
fund_specs_df['Dayofweek'] = inception_parts.dayofweek

### Quarter and Semester

In [0]:
# Quarter comes straight from the date; semester is 1 for quarters 1-2 and 2 otherwise.
inception_quarter = fund_specs_df['inception_date'].dt.quarter
fund_specs_df['Quarter'] = inception_quarter
fund_specs_df['Semester'] = np.where(inception_quarter <= 2, 1, 2)

### Weekend or not

In [0]:
#fund_specs_df['Weekday_name'] = fund_specs_df['inception_date'].dt.weekday_name
#fund_specs_df['is_weekend'] = np.where(fund_specs_df['Weekday_name'].isin(['Sunday','Saturday']),1,0)
#fund_specs_df[['Weekday_name', "is_weekend"]].head()

In [0]:
#fund_specs_df['is_weekend'].value_counts()

### Difference between today and inception date

In [0]:
# Age of each fund in (fractional) days, measured from the moment this cell runs.
time_since_inception = datetime.datetime.today() - fund_specs_df['inception_date']
fund_specs_df["InceptionDate_diff"] = time_since_inception / np.timedelta64(1, 'D')

### Apply one hot encoding

In [0]:
def _inception_dummies(column, prefix):
    """One-hot encode one date-derived column of fund_specs_df, dropping the first level."""
    return pd.get_dummies(fund_specs_df[column], drop_first=True, prefix=prefix)


Inception_Month = _inception_dummies('Inception_Month', "Month")
Dayofweek = _inception_dummies('Dayofweek', "Day")
Quarter = _inception_dummies('Quarter', "Qua")
Semester = _inception_dummies('Semester', "Sem")

### Apply label encoding

In [0]:
# Day-of-month and year are kept as single integer-coded columns rather than one-hot.
# The encoder is refit per column, so it ends up fitted on 'Inception_Year'.
labEnc = LabelEncoder()
for field in ('Inception_Day', 'Inception_Year'):
    fund_specs_df[field] = labEnc.fit_transform(fund_specs_df[field])

### Concat categorical variable

In [0]:
# Put the four one-hot blocks built from the inception date side by side.
# All four come from fund_specs_df, so they share its row index.
fund_specs_df_cat = pd.concat([Inception_Month, Dayofweek, Quarter, Semester], axis=1)
fund_specs_df_cat.shape

In [0]:
#fund_specs_df.hist(figsize=(20,10), bins=50)

In [0]:
#fund_specs_df.describe().T

In [0]:
# Keep the key and the target aside before the target is dropped below, so the
# NaN imputation that follows never touches 'greatstone_rating'. It is merged
# back on 'tag' afterwards.
Target_label = fund_specs_df[['tag', 'greatstone_rating']]
print(Target_label.shape)
Target_label.head()

In [0]:
# Remove the target, the raw date, and the date parts that were one-hot encoded
# above, then show how many missing values remain per column.
fund_specs_cols_to_remove = [
    'greatstone_rating',
    'inception_date',
    "Inception_Month",
    "Dayofweek",
    "Quarter",
    "Semester",
]
fund_specs_df.drop(columns=fund_specs_cols_to_remove, inplace=True)
fund_specs_df.isna().sum()

In [0]:
# Non-object columns: impute with the column median.
for name in fund_specs_df.select_dtypes(exclude='object').columns:
    series = fund_specs_df[name]
    fund_specs_df[name] = series.fillna(series.median())

# Object (categorical) columns: impute with the most frequent value.
for name in fund_specs_df.select_dtypes(include='object').columns:
    series = fund_specs_df[name]
    fund_specs_df[name] = series.fillna(series.mode()[0])

In [0]:
#fund_specs_df.isna().sum()

In [0]:
#fund_specs_df.hist(figsize=(10,8), bins=50)

### Merge target variable to fund_specs_df

In [0]:
# Re-attach the untouched target via the 'tag' key.
# NOTE(review): the result gets a fresh index, and pandas documents that an outer
# merge sorts the join keys, so row positions may no longer match frames derived
# from the pre-merge fund_specs_df.
fund_specs_df = pd.merge(left=fund_specs_df, right=Target_label, on=['tag'], how='outer')

In [0]:
fund_specs_df.head()

In [0]:
# One-hot encode the three categorical fund_specs columns (first level dropped).
investment_class, currency, fund_size = (
    pd.get_dummies(fund_specs_df[field], drop_first=True)
    for field in ('investment_class', 'currency', 'fund_size')
)

for label, dummies in (
    ('investment_class', investment_class),
    ('currency', currency),
    ('fund_size', fund_size),
):
    print('shape of ' + label + ': ', dummies.shape)

In [0]:
# Side-by-side block of the three one-hot encodings; rows follow fund_specs_df's index.
cat_var_encoded = pd.concat([investment_class, currency, fund_size], axis=1)
cat_var_encoded.shape

### Drop original columns after one hot encoding

In [0]:
# The raw categorical columns are now redundant with their one-hot versions.
fund_specs_df.drop(columns=['investment_class', 'currency', 'fund_size'], inplace=True)
fund_specs_df.shape

In [0]:
fund_specs_df.head()

In [0]:
#fund_specs_df.isna().sum()

In [0]:
#fund_specs_df.hist(figsize=(15,10), bins=20)

# other_specs_df

In [0]:
#other_specs_df.info()

In [0]:
#other_specs_df.describe().T

In [0]:
# Strip commas from every text-typed column and cast it to float64.
for name in other_specs_df.select_dtypes(include='object').columns:
    other_specs_df[name] = other_specs_df[name].str.replace(',', '').astype("float64")

In [0]:
# Some features have a median of zero, so missing values are imputed with the mean.
for name in other_specs_df.columns:
    series = other_specs_df[name]
    other_specs_df[name] = series.fillna(series.mean())

#other_specs_df.isna().sum()

In [0]:
#other_specs_df.hist(figsize=(20,30), bins=20, layout=(11,5))

# return_3year_df

In [0]:
#return_3year_df.info()

In [0]:
#return_3year_df.describe().T

In [0]:
# Strip commas from every text-typed column and cast it to float64.
for name in return_3year_df.select_dtypes(include='object').columns:
    return_3year_df[name] = return_3year_df[name].str.replace(',', '').astype("float64")

In [0]:
# Impute missing values in every column with that column's median.
for name in return_3year_df.columns:
    series = return_3year_df[name]
    return_3year_df[name] = series.fillna(series.median())

#return_3year_df.isna().sum()

In [0]:
#return_3year_df.hist(figsize=(20,30))

### Apply One hot encoding

In [0]:
#annual_mean_3yr = pd.get_dummies(return_3year_df['3_years_return_mean_annual_category'], drop_first=True, prefix="annual_mean_3yr")

#sharpe_ratio_3yr = pd.get_dummies(return_3year_df['3yrs_sharpe_ratio_category'], drop_first=True, prefix="sharpe_ratio_3yr")

#beta_3yr = pd.get_dummies(return_3year_df['category_beta_3years'], prefix="beta_3yr", drop_first=True)

In [0]:
#print("Shape of annual_mean_3yr",annual_mean_3yr.shape)
#print("Shape of sharpe_ratio_3yr",sharpe_ratio_3yr.shape)
#print("Shape of beta_3yr",beta_3yr.shape)

### Concat all categorical columns

In [0]:
#return_3year_df_cat = pd.concat([annual_mean_3yr, sharpe_ratio_3yr, beta_3yr], axis=1)
#return_3year_df_cat.shape

### Drop original columns after one hot encoding

In [0]:
return_3year_df.shape

In [0]:
#return_3year_df.drop(labels=['3_years_return_mean_annual_category', '3yrs_sharpe_ratio_category', 'category_beta_3years'], axis=1,inplace=True)
return_3year_df.shape

# return_5year_df

In [0]:
#return_5year_df.info()

In [0]:
#return_5year_df.describe().T

In [0]:
# Strip commas from every text-typed column and cast it to float64.
for name in return_5year_df.select_dtypes(include='object').columns:
    return_5year_df[name] = return_5year_df[name].str.replace(',', '').astype("float64")

In [0]:
# Impute missing values in every column with that column's mean.
for name in return_5year_df.columns:
    series = return_5year_df[name]
    return_5year_df[name] = series.fillna(series.mean())

#return_5year_df.isna().sum()

In [0]:
#return_5year_df.hist(figsize=(20,30))

### Apply one hot encoding

In [0]:
#annual_mean_5yr = pd.get_dummies(return_5year_df['5_years_return_mean_annual_category'],  drop_first=True, prefix="annual_mean_5yr")

#sharpe_ratio_5yr = pd.get_dummies(return_5year_df['5yrs_sharpe_ratio_category'],  drop_first=True, prefix="sharpe_ratio_5yr")

#beta_5yr = pd.get_dummies(return_5year_df['5_years_beta_category'],drop_first=True, prefix="beta_5yr")

In [0]:
#print("Shape of annual_mean_5yr", annual_mean_5yr.shape)
#print("Shape of sharpe_ratio_5yr", sharpe_ratio_5yr.shape)
#print("Shape of beta_5yr", beta_5yr.shape)

### Concat all categorical columns

In [0]:
#return_5year_df_cat = pd.concat([annual_mean_5yr, sharpe_ratio_5yr, beta_5yr],axis=1)
#return_5year_df_cat.shape

### Drop original columns after one hot encoding

In [0]:
return_5year_df.shape

In [0]:
#return_5year_df.drop(labels=['5_years_return_mean_annual_category', '5yrs_sharpe_ratio_category',
#                            '5_years_beta_category'], axis=1, inplace=True)
return_5year_df.shape

# return_10year_df

In [0]:
#return_10year_df.info()

In [0]:
#return_10year_df.describe().T

In [0]:
# Text-typed columns (other than the fund_id key): strip commas and cast to float64.
ten_year_text_cols = [
    name
    for name in return_10year_df.select_dtypes(include='object').columns
    if name != 'fund_id'
]
for name in ten_year_text_cols:
    return_10year_df[name] = return_10year_df[name].str.replace(',', '').astype("float64")

# Impute missing values with the column median; the fund_id key is left untouched.
ten_year_feature_cols = [name for name in return_10year_df.columns if name != 'fund_id']
for name in ten_year_feature_cols:
    series = return_10year_df[name]
    return_10year_df[name] = series.fillna(series.median())

#return_10year_df.isna().sum()

In [0]:
#return_10year_df.hist(figsize=(20,30),bins=20)

### Apply one hot encoding

In [0]:
#annual_mean_10yr = pd.get_dummies(return_10year_df['10_years_return_mean_annual_category'], drop_first=True, prefix="annual_mean_10yr")
#sharpe_ratio_10yr = pd.get_dummies(return_10year_df['10yrs_sharpe_ratio_category'],  drop_first=True, prefix="sharpe_ratio_10yr")
#beta_10yr = pd.get_dummies(return_10year_df['10_years_beta_category'], drop_first=True, prefix="beta_10yr")

In [0]:
#print("Shape of annual_mean_5yr", annual_mean_10yr.shape)
#print("Shape of sharpe_ratio_5yr", sharpe_ratio_10yr.shape)
#print("Shape of beta_5yr", beta_10yr.shape)

### Concat all categorical columns

In [0]:
#return_10year_df_cat = pd.concat([annual_mean_10yr, sharpe_ratio_10yr, beta_10yr],axis=1)
#return_10year_df_cat.shape

### Drop original columns after one hot encoding

In [0]:
return_10year_df.shape

In [0]:
#return_10year_df.drop(labels=['10_years_return_mean_annual_category', '10yrs_sharpe_ratio_category', 
#                              '10_years_beta_category'], axis=1, inplace=True)
return_10year_df.shape

### Merge all dataframe except "fund_config_df" and "return_10year_df" dataframe on "tag" column

In [0]:
# Frames that are joined on the 'tag' key, in merge order.
df_list_1 = [
    bond_ratings_df,
    fund_allocations_df,
    fund_ratios_df,
    fund_specs_df,
    other_specs_df,
    return_3year_df,
    return_5year_df,
]

In [0]:
# Fold the list left to right with outer joins on 'tag', so a tag present in any
# frame is kept.
df_combined_on_tag = df_list_1[0]
for next_frame in df_list_1[1:]:
    df_combined_on_tag = pd.merge(df_combined_on_tag, next_frame, on=['tag'], how='outer')
df_combined_on_tag.shape

### Merge "df_combined_on_tag", "fund_config_df" and "return_10year_df" dataframe on "fund_id" column

In [0]:
# Frames that are joined on the 'fund_id' key, in merge order.
df_list_2 = [
    df_combined_on_tag,
    return_10year_df,
    fund_config_df,
]

In [0]:
# Fold the list left to right with outer joins on 'fund_id'.
merged_df = df_list_2[0]
for next_frame in df_list_2[1:]:
    merged_df = pd.merge(merged_df, next_frame, on=['fund_id'], how='outer')
merged_df.shape

In [0]:
#for col in merged_df.columns:
#    print(col)

## Drop duplicate columns after merging and rename

In [0]:
# Columns that appeared in two source frames were suffixed _x/_y by the merges;
# discard the right-hand (_y) copies.
duplicated_names = (
    'pb_ratio', 'ps_ratio', 'mmc', 'pc_ratio', 'pe_ratio',
    'greatstone_rating', 'fund_return_3years',
)
merged_df.drop(columns=[name + '_y' for name in duplicated_names], inplace=True)

In [0]:
# Give the surviving left-hand (_x) copies their original, unsuffixed names back.
restored_names = (
    'pb_ratio', 'ps_ratio', 'mmc', 'pc_ratio', 'pe_ratio',
    'greatstone_rating', 'fund_return_3years',
)
merged_df.rename(columns={name + "_x": name for name in restored_names}, inplace=True)

In [0]:
merged_df.shape

# Merge all categorical and numerical columns

### All numerical columns

In [0]:
# Snapshot of merged_df's columns before the one-hot columns are appended.
# NOTE: despite the name, this still includes 'fund_id', 'tag' and the target
# 'greatstone_rating'; they are removed from the train/test frames further down.
all_num_col = merged_df.columns
len(all_num_col)

### All categorical variable

In [0]:
#all_cat_df = pd.concat([cat_var_encoded, return_3year_df_cat, return_5year_df_cat, return_10year_df_cat], axis=1)
#all_cat_df = cat_var_encoded

# Combine both one-hot blocks column-wise. pd.concat(axis=1) aligns rows by index label.
# NOTE(review): cat_var_encoded is indexed like fund_specs_df *after* the target was
# merged back, while fund_specs_df_cat is indexed like it was *before* that merge --
# confirm the two indexes refer to the same funds.
all_cat_df = pd.concat([cat_var_encoded, fund_specs_df_cat], axis=1)
all_cat_df.shape

In [0]:
# Names of the one-hot columns, used later to slice them out of the train/test frames.
all_cat_col = all_cat_df.columns
len(all_cat_col)

### Merged df

In [0]:
# Attach the one-hot columns by key instead of by row position. merged_df was built
# by outer merges, which re-sort the join keys and hand back a fresh RangeIndex, so
# an index-aligned concat would pair each fund with some other fund's dummy values.
# Each one-hot block is first given the 'tag' of the frame it was derived from.
_inception_cat_by_tag = fund_specs_df_cat.assign(tag=Target_label['tag'])  # indexed like the raw fund_specs rows
_spec_cat_by_tag = cat_var_encoded.assign(tag=fund_specs_df['tag'])        # indexed like the re-merged fund_specs rows
_all_cat_by_tag = (
    pd.merge(_spec_cat_by_tag, _inception_cat_by_tag, on='tag', how='outer')
    .drop_duplicates(subset='tag')  # one row per tag, so the left join below cannot add rows
)
# A left join keeps every merged_df row; the appended column names equal all_cat_col.
merged_df = pd.merge(merged_df, _all_cat_by_tag, on='tag', how='left')

In [0]:
merged_df.shape

In [0]:
#for col in merged_df.columns:
#    print(col,"------", merged_df[col].dtype)

# Segregate the training & test data based on where the greatstone ratings are provided or not

In [0]:
submission_df.head()

### Perform an inner join on "fund_id"; the rows in common are the test data

In [0]:
# Rows whose fund_id appears in the sample submission form the test set. Both frames
# carry 'greatstone_rating', so keep the submission's copy (_x) under the plain name
# and discard merged_df's copy (_y).
Test_df = (
    submission_df
    .merge(merged_df, how='inner', on=['fund_id'])
    .drop(columns=['greatstone_rating_y'])
    .rename(columns={'greatstone_rating_x': 'greatstone_rating'})
)
Test_df.shape

# Remove test data from merged dataframe and name that as Train_df

In [0]:
# Everything that is not a test fund becomes training data.
is_test_fund = merged_df["fund_id"].isin(Test_df["fund_id"])
Train_df = merged_df[~is_test_fund]
Train_df.shape

### Difference between test and train

In [0]:
Train_df.columns.difference(Test_df.columns)

### Check whether Test_df and submission_df are in the same order

In [0]:
# Lists the rows whose fund_id differs from the submission file at the same index;
# an empty result means both frames are in the same order.
Test_df[Test_df['fund_id'] != submission_df['fund_id']]

### Copy submit_df

In [0]:
# Independent copy of the id/target pair; the predictions are written into it later.
submit_df = Test_df[['fund_id', 'greatstone_rating']].copy()

In [0]:
submit_df.shape

### Reset Train_df index

In [0]:
# Renumber rows 0..n-1 (old index discarded) so Train_df lines up positionally with
# the fresh DataFrame built from the scaled array further down.
Train_df.reset_index(inplace=True,drop=True)

In [0]:
Train_df.index

### Reset Test_df index

In [0]:
# Renumber rows 0..n-1 (old index discarded), mirroring what was done for Train_df.
Test_df.reset_index(inplace=True, drop=True)

In [0]:
Test_df.index

### Separate numerical and categorical columns

In [0]:
# Split the training frame into the pre-one-hot columns and the one-hot columns.
Train_df_num = Train_df[all_num_col]
Train_df_cat = Train_df[all_cat_col]

In [0]:
# Sanity check: the two column groups should add up to the full training frame.
for label, frame in (
    ("Train_df: ", Train_df),
    ("Train_df_num: ", Train_df_num),
    ("Train_df_cal: ", Train_df_cat),
):
    print(label, frame.shape)

In [0]:
# Same split for the test frame: pre-one-hot columns vs. one-hot columns.
Test_df_num = Test_df[all_num_col]
Test_df_cat = Test_df[all_cat_col]

In [0]:
# Sanity check: the two column groups should add up to the full test frame.
for label, frame in (
    ("Test_df: ", Test_df),
    ("Test_df_num: ", Test_df_num),
    ("Test_df_cat: ", Test_df_cat),
):
    print(label, frame.shape)

### Drop "fund_id" and "tag" from train and test

In [0]:
# The join keys are identifiers, not features; remove them from both numeric frames.
key_columns = ['fund_id', 'tag']
Train_df_num = Train_df_num.drop(columns=key_columns)
Test_df_num = Test_df_num.drop(columns=key_columns)

In [0]:
# Both frames should now have the same number of columns.
print("Train shape after dropping: ", Train_df_num.shape)
print("Test shape after dropping: ", Test_df_num.shape)

# Feature Selection correlation based

#### 1. Compare the correlation between features and, for each highly correlated pair, remove one of the two features.

In [0]:
# Pairwise correlation of the training columns (pandas' default method).
# NOTE: 'greatstone_rating' is still part of Train_df_num here, so the target is in this matrix too.
corrmat = Train_df_num.corr()

In [0]:
corrmat.shape

In [0]:
# Boolean keep-mask with one entry per corrmat column; everything starts as "keep".
columns = np.full((corrmat.shape[0],), True, dtype=bool)
columns

In [0]:
len(columns)

In [0]:
# Drop a column when it is strongly correlated with a column to its left.
# Correlation strength is measured as |r|, so strongly *negative* pairs are treated as
# redundant too (the plain `r >= 0.9` test missed them).
corr_threshold = 0.9
strongly_correlated = corrmat.abs().to_numpy() >= corr_threshold

# The target must neither be dropped nor cause a feature to be dropped: a feature that
# tracks the target closely is exactly what we want to keep.
if 'greatstone_rating' in corrmat.columns:
    target_pos = corrmat.columns.get_loc('greatstone_rating')
    strongly_correlated[target_pos, :] = False
    strongly_correlated[:, target_pos] = False

# Only pairs (i, j) with i < j count (strict upper triangle): column j is dropped if
# any earlier column i is strongly correlated with it.
columns = ~np.triu(strongly_correlated, k=1).any(axis=0)

# List of selected columns (indexed via corrmat so the mask length always matches)
selected_columns = corrmat.columns[columns]
print("Number of selected columns: ", len(selected_columns))

#### 2. Find features which are highly correlated with target columns

In [0]:
#Train_df_num = Train_df_num[selected_columns]
#Train_df_num.shape

In [0]:
#corr_matrix = Train_df_num.corr()

In [0]:
#corr_matrix.shape

In [0]:
#Correlation with output variable
#corr_target = abs(corr_matrix["greatstone_rating"])

In [0]:
#corr_matrix["greatstone_rating"] = abs(corr_matrix["greatstone_rating"])

In [0]:
# filter column having corr more that threshold
#filtered_corr_mat = corr_matrix[corr_matrix["greatstone_rating"] > 0.1]
#filtered_corr_mat.shape

### List of selected columns

In [0]:
#filtered_corr_mat_T = filtered_corr_mat.T
#selected_columns = filtered_corr_mat_T.columns
#selected_columns

### Create new train and test dataframe based on filtered columns 

In [0]:
Train_df_num.shape

In [0]:
#Train_df_num = Train_df_num[selected_columns]
#Train_df_num.shape

In [0]:
#Test_df_num = Test_df_num[selected_columns]
#Test_df_num.shape

## Get X_train, Y_train and X_test  

In [0]:
# Separate the training target (as a NumPy array) from the training features.
target_column = 'greatstone_rating'
Y_train = Train_df_num[target_column].values
X_train = Train_df_num.drop(columns=target_column)

In [0]:
# The first dimension of both shapes should match (one label per training row).
print("Full Training data shape: ", X_train.shape, Y_train.shape)

In [0]:
# Test features: the target column here is only the placeholder from the sample submission.
X_test = Test_df_num.drop(columns='greatstone_rating')
print("Full Test data: ", X_test.shape)

#### Convert train labels to one-hot vectors

In [0]:
#from tensorflow.keras import utils
#n_classes = len(np.unique(Y_train))
#print("Number of classed: ",n_classes)
#Y_train = utils.to_categorical(Y_train, num_classes=n_classes)
#Y_train

### Data imbalance

In [0]:
# Count the training rows per rating value 0..5 to see how imbalanced the target is.
class_0, class_1, class_2, class_3, class_4, class_5 = (
    np.count_nonzero(Y_train == float(rating)) for rating in range(6)
)

print("Number data for each class: ")
for rating, n_rows in enumerate((class_0, class_1, class_2, class_3, class_4, class_5)):
    print("class_" + str(rating) + ": ", n_rows)

In [0]:
#from imblearn.over_sampling import SMOTE
#smote = SMOTE()
#X_train, Y_train = smote.fit_sample(X_train, Y_train)
#print("Training length after smote: ", len(X_train), len(Y_train))

#### Re-check the class counts (after the optional SMOTE step), then convert the train labels to one-hot vectors

In [0]:
# Recount the training rows per rating value 0..5 (relevant when the SMOTE cell above is enabled).
class_0, class_1, class_2, class_3, class_4, class_5 = (
    np.count_nonzero(Y_train == float(rating)) for rating in range(6)
)

print("Number data for each class: ")
for rating, n_rows in enumerate((class_0, class_1, class_2, class_3, class_4, class_5)):
    print("class_" + str(rating) + ": ", n_rows)

In [0]:
from tensorflow.keras import utils

# Size the one-hot encoding from the largest label, not from the number of distinct
# labels: to_categorical needs num_classes > max(label), so counting distinct values
# fails as soon as one rating value is absent from the training rows.
n_classes = int(np.max(Y_train)) + 1
print("Number of classed: ", n_classes)
# NOTE: this overwrites Y_train with its one-hot form, so the cell must run only once per kernel.
Y_train = utils.to_categorical(Y_train, num_classes=n_classes)
Y_train

### Apply StandardScaler on the full train and test data

In [0]:
# Fit the scaler on the training features only, then apply that same fitted
# transform to both train and test.
std_scaler = StandardScaler().fit(X_train)
X_train_scaled, X_test_scaled = (
    std_scaler.transform(features) for features in (X_train, X_test)
)

In [0]:
# Both arrays should have the same number of columns.
print("X_train_scaled shape: ", X_train_scaled.shape)
print("X_test_scaled shape: ", X_test_scaled.shape)

### Concat all numerical and categorical columns

In [0]:
# Wrap the scaled arrays in DataFrames (default 0..n-1 index, integer column labels)
# so the one-hot columns can be concatenated next to them.
X_train_scaled = pd.DataFrame(X_train_scaled)
X_test_scaled = pd.DataFrame(X_test_scaled)

#### X_train_scaled combined with categorical features

In [0]:
# Index-aligned concat; this lines up because Train_df had its index reset to 0..n-1 earlier.
X_train_scaled = pd.concat([X_train_scaled, Train_df_cat],axis=1)
X_train_scaled.shape

In [0]:
# Force one float dtype. The frame mixes scaled floats with one-hot columns, and
# pandas >= 2.0 emits those as bool; an untyped to_numpy() would then return an
# object array, which Keras cannot train on.
X_train_scaled = X_train_scaled.to_numpy(dtype="float64")
X_train_scaled

#### X_test_scaled combined with categorical features

In [0]:
# Index-aligned concat; this lines up because Test_df had its index reset to 0..n-1 earlier.
X_test_scaled = pd.concat([X_test_scaled, Test_df_cat],axis=1)
X_test_scaled.shape

In [0]:
# Force one float dtype. The frame mixes scaled floats with one-hot columns, and
# pandas >= 2.0 emits those as bool; an untyped to_numpy() would then return an
# object array, which Keras cannot predict on.
X_test_scaled = X_test_scaled.to_numpy(dtype="float64")
X_test_scaled

## Apply PCA

In [0]:
#pca = PCA()
#pca.fit(X_train_scaled)

In [0]:
#pca.explained_variance_ratio_

In [0]:
#len(pca.explained_variance_ratio_)

In [0]:
#X_train_pca = pca.transform(X_train_scaled)
#X_test_pca = pca.transform(X_test_scaled)

In [0]:
#print("X_train_pca Shape: ",X_train_pca.shape)
#print("X_test_pca Shape: ",X_test_pca.shape)

# Build Neural Net

In [0]:
# Reset Keras' global state so that re-running the model cells starts from scratch.
keras.backend.clear_session()

In [0]:
# Fully connected classifier: four ReLU hidden layers of decreasing width feeding a
# 6-unit softmax output (one unit per rating class 0-5). No input shape is declared;
# Keras infers it on the first call to fit.
hidden_layer_widths = (254, 127, 64, 30)
model = Sequential(
    [Dense(units=width, activation='relu') for width in hidden_layer_widths]
    + [Dense(units=6, activation='softmax')]
)


In [0]:
# Categorical cross-entropy matches the one-hot Y_train produced above.
# NOTE(review): training uses all rows with no validation split or early stopping,
# so the accuracy reported during fitting is training accuracy only.
model.compile(loss="categorical_crossentropy", optimizer='adam', metrics=['accuracy'])
model.fit(X_train_scaled, Y_train, epochs=100)

### Training Score

In [0]:
# Evaluated on the same data the model was fitted on; `score` holds the loss
# followed by the compiled metric (accuracy).
score = model.evaluate(X_train_scaled, Y_train)
print("Training score: ",score)

### Prediction

In [0]:
# `Sequential.predict_classes` was removed in TensorFlow 2.6. Taking the argmax over
# the softmax output yields the same class indices and works on every TF 2.x release.
Y_pred2 = np.argmax(model.predict(X_test_scaled), axis=-1)
Y_pred2

In [0]:
# Positional assignment: prediction i goes to row i, relying on submit_df having
# been copied from Test_df in the same row order as X_test_scaled.
submit_df['greatstone_rating'] = Y_pred2

In [0]:
# Make fund_id the index so to_csv writes it as the first column.
# NOTE: in-place, so re-running this cell fails because 'fund_id' is no longer a column.
submit_df.set_index(keys='fund_id',inplace=True)

In [0]:
submit_df.head()

In [0]:
# -f: do not report an error when no earlier submission file exists (fresh runtime).
!rm -f submission_nn.csv

In [0]:
# Write the submission to the runtime's working directory; the fund_id index becomes the first column.
submit_df.to_csv("submission_nn.csv")

In [0]:
!ls

In [0]:
# Colab-only: trigger a browser download of the submission file.
from google.colab import files
files.download("submission_nn.csv")