In [8]:
# Imports for the notebook.
import os
import pandas as pd
import numpy as np
import warnings
from sklearn.model_selection import train_test_split
# Blanket-ignore every warning category for the rest of the session.
# NOTE(review): this also hides deprecation and chained-assignment warnings
# raised by later cells; consider narrowing the filter to specific categories.
warnings.filterwarnings('ignore')

# Loading and cleaning the dataset

In [14]:
# Code starts here
# Load the training data, normalise the column names and report how many
# values are missing in each column.

# The CSV location can be overridden with the ML2_TRAIN_CSV environment
# variable so the notebook runs on other machines; the default is the path
# the notebook was originally run with.
DATA_PATH = os.environ.get(
    'ML2_TRAIN_CSV',
    r'C:\Users\SHRIK\Desktop\shrikant\Projects\GreyAtom Projects\Cash Deposit Prediction\data\ml2_train.csv',
)
df = pd.read_csv(DATA_PATH)

# Lower-case the column names and replace spaces with underscores.
df.columns = df.columns.str.lower().str.replace(' ', '_', regex=False)

# Treat any literal 'NaN' strings as real missing values.
df = df.replace('NaN', np.nan)

# Missing-value count per column.
print(df.isnull().sum())

serial_number          0
main_office            0
branch_number          0
established_date       0
acquired_date       1493
city                   0
county                 0
state                  0
2010_deposits        740
2011_deposits        578
2012_deposits        329
2013_deposits        175
2014_deposits         56
2015_deposits         19
2016_deposits          0
dtype: int64


# Handling timestamp data


In [None]:
# Index the frame by serial number, parse the date columns and split the data
# into training and validation sets.

# Guarded so that re-running this cell does not fail once 'serial_number' has
# already been moved into the index.
if 'serial_number' in df.columns:
    df = df.set_index('serial_number')

# Parse both date columns; missing acquired dates become NaT.
for date_col in ('established_date', 'acquired_date'):
    df[date_col] = pd.to_datetime(df[date_col])
print(df.head())

# Select the target by name rather than by position so the split does not
# depend on the column order of the CSV.
target_col = '2016_deposits'
y = df[target_col]
X = df.drop(columns=target_col)

X_train, X_val, y_train, y_val = train_test_split(
    X, y, test_size=0.25, random_state=3
)

# Later cells add and drop columns on these frames, so make them independent
# copies instead of slices of `df`.
X_train = X_train.copy()
X_val = X_val.copy()

# Generate age features (years elapsed since the established and acquired dates)

In [None]:
# Replace each date column with a numeric "since_<column>" feature holding the
# elapsed time in years (whole days / 365) up to the moment this cell runs.
time_col = ['established_date', 'acquired_date']

# Code starts from here
# `pd.datetime` was removed in pandas 2.0; `pd.Timestamp.now()` is the
# supported equivalent. Capture it once so every row and both frames are
# measured against the same reference instant.
reference_time = pd.Timestamp.now()

for col_name in time_col:
    new_col_name = "since_" + col_name
    # `.dt.days` yields NaN where the date is missing (NaT).
    X_train[new_col_name] = (reference_time - X_train[col_name]).dt.days / 365
    X_val[new_col_name] = (reference_time - X_val[col_name]).dt.days / 365

# The raw datetime columns are no longer needed once the ages are derived.
X_train = X_train.drop(columns=time_col)
X_val = X_val.drop(columns=time_col)
print(X_train.head())

# Encoding and filling missing values


In [None]:
from sklearn.preprocessing import LabelEncoder

# Object-dtype (categorical) columns, determined before imputation.
cat = X_train.select_dtypes(include='O').columns.tolist()

# Missing values

# Every missing value, in any column, is replaced by 0.
X_train = X_train.fillna(0)
X_val = X_val.fillna(0)

# Label encoding

# Code given to validation labels that never occur in the training data.
UNSEEN_CODE = -1

for col in cat:
    # Fit on the training data only, then reuse the same label -> integer
    # mapping for the validation data. Re-fitting the encoder on the
    # validation frame would assign different integers to the same category
    # in the two frames.
    le = LabelEncoder()
    X_train[col] = le.fit_transform(X_train[col])
    label_to_code = {label: code for code, label in enumerate(le.classes_)}
    X_val[col] = (
        X_val[col].map(label_to_code).fillna(UNSEEN_CODE).astype('int64')
    )

# One hot encoding

# NOTE(review): the two frames are dummy-encoded independently, so their
# column sets can differ; the shapes printed below make that visible. The
# models in the following cells are fitted on the label-encoded
# X_train / X_val, not on these *_temp frames.
X_train_temp = pd.get_dummies(data=X_train, columns=cat)
X_val_temp = pd.get_dummies(data=X_val, columns=cat)

# Shape of train and test data
print(X_train_temp.shape, X_val_temp.shape)

# Observation
Some models, such as linear regression, require categorical data to be one-hot encoded in order to work well. The shape error we faced in the previous task while applying linear regression was caused by the one-hot encoding applied to the dataset: some categories present in the training set were not present in the test set, so the two dataframes ended up with different numbers of columns. One way to tackle this is to use a tree-based algorithm. A major advantage of decision tree models and their ensemble counterparts, random forests, is that they are able to operate on both continuous and categorical variables directly (in scikit-learn the categories still have to be encoded as numbers, which is why the label-encoded columns are used below). In contrast, most other popular models (e.g., linear regression) must instead transform categorical variables into some numerical analog, usually by one-hot encoding them to create new dummy variables.

# Decision Tree
We have learned how the decision tree algorithm works for classification. Decision trees can be used for regression as well; we will use `DecisionTreeRegressor` from the sklearn library to do so.

In [None]:
from sklearn.metrics import mean_squared_error
from sklearn.tree import DecisionTreeRegressor

# Fit a regression tree on the training split; the fixed seed keeps the
# fitted tree reproducible between runs.
dt = DecisionTreeRegressor(random_state=5).fit(X_train, y_train)

# For a regressor, `score` is the coefficient of determination (R^2) on the
# given data, not a classification accuracy.
accuracy = dt.score(X_val, y_val)
print(accuracy)

# Root-mean-squared error on the validation split.
y_pred = dt.predict(X_val)
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
print("RMSE:", rmse)

## XGBoost

In [None]:
from xgboost import XGBRegressor

# Gradient-boosted trees with the hyperparameters chosen for this notebook.
xgb_params = {
    'max_depth': 50,
    'learning_rate': 0.83,
    'n_estimators': 100,
}
xgb = XGBRegressor(**xgb_params).fit(X_train, y_train)

# `score` on a regressor is R^2 on the validation split.
accuracy = xgb.score(X_val, y_val)
print(accuracy)

# Root-mean-squared error on the validation split
# (`mean_squared_error` is imported in the decision tree cell).
y_pred = xgb.predict(X_val)
mse = mean_squared_error(y_val, y_pred)
rmse = np.sqrt(mse)
print("RMSE:", rmse)