# Lending Club Loan Data 
### Analyzing Lending Club's issued loans
### Code by: Sejal Jaiswal
--> https://www.kaggle.com/wendykan/lending-club-loan-data

In [None]:
import os

import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import plotly
import plotly.graph_objs as go
import plotly.plotly as py
import tensorflow as tf
from sklearn import ensemble
from sklearn import metrics
from sklearn import preprocessing
from sklearn.ensemble import ExtraTreesClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import SelectFromModel
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPClassifier
from sklearn.preprocessing import LabelEncoder
from sklearn.preprocessing import OneHotEncoder
from wordcloud import STOPWORDS
from wordcloud import WordCloud

%matplotlib inline

## Step1: Data understanding

In [None]:
# Location of the Lending Club CSV. The LOAN_CSV_PATH environment variable
# overrides the machine-specific default, so the notebook can run on another
# machine without editing this cell.
LOAN_CSV_PATH = os.environ.get("LOAN_CSV_PATH", "/Users/Sejal/Desktop/LoanProject/loan.csv")

loan_data = pd.read_csv(LOAN_CSV_PATH, sep=",")
print("Data shape is: {}".format(loan_data.shape))
print("Column names: {}".format(loan_data.columns))

### Data Visualization - Address

Working with the address data: 'addr_state'

In [None]:
# Two-letter code -> display name, used to label states in the hover text.
# Built from (code, name) pairs; the pair order is the resulting key order.
short_state_names = dict([
    ('AK', 'Alaska'), ('AL', 'Alabama'), ('AR', 'Arkansas'),
    ('AS', 'American Samoa'), ('AZ', 'Arizona'), ('CA', 'California'),
    ('CO', 'Colorado'), ('CT', 'Connecticut'), ('DC', 'District of Columbia'),
    ('DE', 'Delaware'), ('FL', 'Florida'), ('GA', 'Georgia'),
    ('GU', 'Guam'), ('HI', 'Hawaii'), ('IA', 'Iowa'),
    ('ID', 'Idaho'), ('IL', 'Illinois'), ('IN', 'Indiana'),
    ('KS', 'Kansas'), ('KY', 'Kentucky'), ('LA', 'Louisiana'),
    ('MA', 'Massachusetts'), ('MD', 'Maryland'), ('ME', 'Maine'),
    ('MI', 'Michigan'), ('MN', 'Minnesota'), ('MO', 'Missouri'),
    ('MP', 'Northern Mariana Islands'), ('MS', 'Mississippi'), ('MT', 'Montana'),
    ('NA', 'National'), ('NC', 'North Carolina'), ('ND', 'North Dakota'),
    ('NE', 'Nebraska'), ('NH', 'New Hampshire'), ('NJ', 'New Jersey'),
    ('NM', 'New Mexico'), ('NV', 'Nevada'), ('NY', 'New York'),
    ('OH', 'Ohio'), ('OK', 'Oklahoma'), ('OR', 'Oregon'),
    ('PA', 'Pennsylvania'), ('PR', 'Puerto Rico'), ('RI', 'Rhode Island'),
    ('SC', 'South Carolina'), ('SD', 'South Dakota'), ('TN', 'Tennessee'),
    ('TX', 'Texas'), ('UT', 'Utah'), ('VA', 'Virginia'),
    ('VI', 'Virgin Islands'), ('VT', 'Vermont'), ('WA', 'Washington'),
    ('WI', 'Wisconsin'), ('WV', 'West Virginia'), ('WY', 'Wyoming'),
])

In [None]:
# Per-state summary statistics; the hover-text cells read 'loan_amnt' from the
# mean frame and 'int_rate' from the max/min frames.
state_grp = loan_data.groupby('addr_state')
# numeric_only=True: text columns have no mean, and pandas >= 2.0 raises on
# them instead of silently leaving them out.
mean_values = state_grp.mean(numeric_only=True)
max_values = state_grp.max()
min_values = state_grp.min()

In [None]:
# Per-state figures for the hover text, in the 'addr_state' order produced by
# the groupby. Reading the aggregated columns directly gives the same values
# the key-matching loops collected, without depending on `state_name`, which
# is only defined in a later cell (NameError on a fresh top-to-bottom run).

# Mean loan amount in each state
mean_value_loan_amnt = mean_values['loan_amnt'].tolist()

# Maximum interest rate in each state
max_int_rate = max_values['int_rate'].tolist()

# Minimum interest rate in each state
min_int_rate = min_values['int_rate'].tolist()

In [None]:
# Number of loans per state; the index holds the 'addr_state' codes.
total_sanctioned_loans = loan_data.groupby('addr_state').size()
# Series.iteritems() was removed in pandas 2.0, so read index and values directly.
state_name = total_sanctioned_loans.index.tolist()
state_value = total_sanctioned_loans.tolist()

# Translate each code to its full name. A code missing from short_state_names
# falls back to the code itself, so this list always has the same length and
# order as state_name (a skipped entry would misalign the hover text).
full_state_name = [short_state_names.get(code, code) for code in state_name]

In [None]:
# Hover-text table: one row per state, with every column stored as a string so
# the pieces can be concatenated into a single '<br>'-separated snippet.
text1 = pd.DataFrame({
    'full_state_name': list(full_state_name),
    'mean_loan_amnt': list(mean_value_loan_amnt),
    'max_interest_rate': list(max_int_rate),
    'min_interest_rate': list(min_int_rate),
}).astype(str)

text1['text'] = (
    'State: ' + text1['full_state_name']
    + '<br>' + 'Average \'loan_amnt\': ' + text1['mean_loan_amnt']
    + '<br>' + 'Max Interest rate: ' + text1['max_interest_rate']
    + '<br>' + 'Min Interest rate: ' + text1['min_interest_rate']
)

In [None]:
# Colour ramp for the map: a tuple of [position in 0..1, colour] stops.
scl = (
    [0, "rgb(150,0,90)"],
    [0.125, "rgb(0, 0, 200)"],
    [0.25, "rgb(0, 25, 255)"],
    [0.375, "rgb(0, 152, 255)"],
    [0.5, "rgb(44, 255, 150)"],
    [0.625, "rgb(151, 255, 0)"],
    [0.75, "rgb(255, 234, 0)"],
    [0.875, "rgb(255, 111, 0)"],
    [1, "rgb(255, 111, 0)"],
)

# One choropleth trace keyed by state code: z is the loan count per state and
# the hover text is the pre-built text1['text'] column.
state_outline = dict(line=dict(color='rgb(255,255,255)', width=2))
loans_trace = dict(
    type='choropleth',
    colorscale=scl,
    autocolorscale=False,
    locations=state_name,
    z=state_value,
    locationmode='USA-states',
    text=text1['text'],
    marker=state_outline,
    colorbar=dict(title="Number of loans sanctioned"),
)
data = [loans_trace]

usa_geo = dict(
    scope='usa',
    projection=dict(type='albers usa'),
    showlakes=True,
    lakecolor='rgb(255, 255, 255)',
)
layout = dict(
    title='Number of loans sanctioned per state <br>(Hover for breakdown)',
    geo=usa_geo,
)

fig = dict(data=data, layout=layout)
# Rendered through plotly's offline mode under the name 'USA_plot'.
plotly.offline.plot(fig, filename='USA_plot')

### Rough Analysis
Trying to identify data columns that can be deleted. 

In [None]:
# 'id' and 'member_id' are all unique values (per the original analysis), so
# they are left out of the per-column inspection.
check_col = [name for name in list(loan_data.columns) if name not in ('id', 'member_id')]
print(check_col)

# Print how many rows fall under each distinct value of every remaining column.
for name in check_col:
    value_sizes = loan_data.groupby(name).size()
    print(value_sizes, "\n")

#### Analysis results
1. Few unique values: term, grade, sub_grade, home_ownership, pymnt_plan, initial_list_status, policy_code (1 group), application_type (2 groups), verification_status_joint (3 groups).
2. 'desc' (description) is only useful for NLP.
3. Many features have very few non-null values, so we need to check for NaN values.

### Data Visualization - Null values

In [None]:
# Missing-value count for each column, in column order (feeds the bar chart).
nan_data = [loan_data[str(name)].isnull().sum() for name in loan_data.columns]

In [None]:
# Bar chart: one bar per column, height = number of missing values.
data = [go.Bar(x=list(loan_data.columns), y=nan_data, marker=dict(color='#59606D'))]
layout = go.Layout(
    title='Number of Missing Values per Feature',
    autosize=False,
    width=1000,
    height=500,
    # A plain dict replaces the deprecated go.Margin class and is accepted by
    # every plotly version. The large bottom margin is presumably there to fit
    # the long column names on the x axis.
    margin=dict(l=50, r=50, b=250, t=50, pad=4),
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.plot(fig, filename='freq_missing_values')

We can remove columns that provide very little insight (here, columns whose non-null values cover less than 30% of the rows) or that contain only NULL values.
#### Remark:
This is not always a good choice, because a sparsely populated column may apply only to a specific group of records; in our case it could be an identifying feature of 'default clients'.

In [None]:
# A column is marked for deletion when its non-null values cover less than this
# fraction of the rows.
MIN_NON_NULL_FRACTION = 0.3

# Series.count() is the number of non-null entries, which is the same figure
# the groupby(...).size() total produced, without building a full grouping
# for every column.
min_non_null_rows = MIN_NON_NULL_FRACTION * len(loan_data)
delete_feat = [name for name in check_col if loan_data[name].count() < min_non_null_rows]
print(len(delete_feat))

### Data Visualization - 'loan_status'
Visualising the status of the loan takers.

In [None]:
# Row count per loan status.
loan_status_summary = loan_data.groupby('loan_status').size()
print(loan_status_summary)

# Status names become the pie labels and their counts the pie values.
# (Series.iteritems() was removed in pandas 2.0; index/values work everywhere.)
x_loanstatus = loan_status_summary.index.tolist()
y_loanstatus = loan_status_summary.tolist()

trace = go.Pie(labels=x_loanstatus, values=y_loanstatus)
layout = go.Layout(title='Loan Status Values')
fig = go.Figure(data=[trace], layout=layout)
plotly.offline.plot(fig, filename='loan_status_graph')

#### Displaying the 'loan_status' in terms of final prediction values: Defaulter or Non-defaulter

In [None]:
# Statuses counted as a default; every other value is a non-default.
defaulter_statuses = [
    'Default',
    'Charged Off',
    'Does not meet the credit policy. Status:Charged Off',
    'Late (31-120 days)',
]
is_defaulter = loan_data.loan_status.isin(defaulter_statuses)
defaulter = int(is_defaulter.sum())
non_defaulter = int(len(is_defaulter) - defaulter)
print(defaulter)
print(non_defaulter)

trace = go.Pie(labels=['Defaulters', 'Non-Defaulters'], values=[defaulter, non_defaulter])
layout = go.Layout(title='Defaulters Vs Non-defaulters')
fig = go.Figure(data=[trace], layout=layout)
plotly.offline.plot(fig, filename='defaulters_nondefaulters')

### Data Visualization - 'emp_title'
Visualising the professions of the loan takers

In [None]:
# Row count per employment title.
emp_title_summary = loan_data.groupby('emp_title').size()

# 'Teacher' and 'teacher' are listed as two different professions; fold the
# lower-case count into 'Teacher'. .get(..., 0) keeps the cell from raising a
# KeyError when either spelling is absent from the data.
emp_title_summary['Teacher'] = emp_title_summary.get('Teacher', 0) + emp_title_summary.get('teacher', 0)
emp_title_summary['teacher'] = 0
emp_title_summary = emp_title_summary.sort_values(ascending=False)
print(emp_title_summary)

# Titles and their counts, most frequent first.
# (Series.iteritems() was removed in pandas 2.0; index/values work everywhere.)
x_professions = emp_title_summary.index.tolist()
y_n_professions = emp_title_summary.tolist()

There are a lot of professions listed. Hence, we only look at a few common ones.

In [None]:
# Pie chart limited to the first 30 entries of the profession lists.
n_shown = 30
shown_labels = x_professions[:n_shown]
shown_values = y_n_professions[:n_shown]
trace = go.Pie(labels=shown_labels, values=shown_values)
layout = go.Layout(title=('Professions <br> (Hover for breakdown)'))
fig = go.Figure(data=[trace], layout=layout)
plotly.offline.plot(fig, filename='professions')

### Data Visualization - 'title' 
Visualising the major reason why people took the loan, in terms of a word cloud.

In [None]:
# Collect every loan title and join them into one space-separated string for
# the word cloud. Missing titles are dropped first; otherwise each NaN would
# be formatted as the literal word 'nan' and show up in the cloud.
words = [loan_data.title.dropna().tolist()]
sentence = ''.join(' %s' % word for word in words[0])

# Exclude 'consolidation' (both capitalisations) from the cloud.
# STOPWORDS is wordcloud's stop-word set, imported at the top of the notebook.
STOPWORDS.add('consolidation')
STOPWORDS.add('Consolidation')

In [None]:
# Word-cloud settings gathered in one place, then applied in a single call.
cloud_options = dict(
    font_path=None, width=2000, height=1000, margin=4, ranks_only=None,
    prefer_horizontal=0.7, scale=1, max_words=2000, min_font_size=15,
    stopwords=STOPWORDS, random_state=None, background_color='white',
    max_font_size=None, font_step=1, mode='RGB', relative_scaling=0,
)
# Only the first 1,000,000 characters of the joined titles are used.
cloud_text = sentence[:1000000]
wordcloud = WordCloud(**cloud_options).generate(cloud_text)

plt.figure(figsize=(15, 10))
plt.imshow(wordcloud, interpolation="bilinear")
plt.axis("off")
plt.show()

In [None]:
# Row count per loan title, most common first. Without the sort the pie would
# show the first 30 titles in alphabetical order rather than the major
# reasons for taking a loan.
reason_summary = loan_data.groupby('title').size().sort_values(ascending=False)

# (Series.iteritems() was removed in pandas 2.0; index/values work everywhere.)
x_reason = reason_summary.index.tolist()
y_n_reason = reason_summary.tolist()

trace = go.Pie(labels=x_reason[:30], values=y_n_reason[:30])
layout = go.Layout(title=('Reason for taking Loan <br> (Hover for breakdown)'))
fig = go.Figure(data=[trace], layout=layout)
plotly.offline.plot(fig, filename='reason_loan')

## Step 2.  Data Transformation

Creating a new dataframe to work with: 'data2'.
Delete some columns:
1. Deleting the columns that provide less data insight.
2. Deleting columns with high frequency of Null values (not now, but maybe after feature importance calculation)

In [None]:
# Columns removed before modeling:
#   * delete_feat      - sparsely populated columns found earlier
#   * acc_now_delinq   - mostly always 0
#   * policy_code      - always 1
#   * url, desc, title - better suited to NLP or semantic web mining
columns_to_drop = list(dict.fromkeys(
    list(delete_feat) + ['acc_now_delinq', 'policy_code', 'url', 'desc', 'title']
))

# drop() returns a new frame, so loan_data is left untouched and data2 is not
# a slice of it (no SettingWithCopyWarning on later assignments).
# errors='ignore' matters because a column named explicitly above may already
# be in delete_feat; dropping it a second time used to raise a KeyError.
data2 = loan_data.drop(columns=columns_to_drop, errors='ignore')

In [None]:
# Convert 'term' to a number by keeping only its digits (the 'months' word and
# any surrounding spaces are discarded). Missing values stay NaN.
data2['term'] = pd.to_numeric(
    loan_data['term'].astype(str).str.extract(r'(\d+)', expand=False)
)

# Convert 'int_rate' to a fraction. A trailing '%' is stripped when the column
# was read as text; astype(str) first makes the same line work when the column
# was already parsed as a number.
data2['int_rate'] = pd.to_numeric(
    loan_data['int_rate'].astype(str).str.rstrip('%')
) / 100.0

In [None]:
# Text (object) columns before conversion.
object_columns = list(data2.select_dtypes(include=['object']).columns)
print(object_columns)

# Convert every text column that holds only numeric strings. The assignment
# must use data2[column]: `data2.i = ...` only set an attribute literally
# named "i" and left every column unchanged.
for column in object_columns:
    try:
        data2[column] = pd.to_numeric(data2[str(column)])
    except (ValueError, TypeError):
        # Genuinely non-numeric text: leave the column as it is
        # (same outcome as the former errors='ignore').
        pass

# Text columns that remain after conversion.
print(list(data2.select_dtypes(include=['object']).columns))

In [None]:
# Keep only the first run of digits from 'emp_length' and store it as a float.
# expand=False makes extract() return a Series rather than a one-column frame,
# and the raw string avoids the invalid-escape warning for '\d'.
# NOTE(review): any two labels sharing the same digits (presumably e.g.
# '< 1 year' and '1 year') collapse to the same number -- confirm that is intended.
data2['emp_length'] = data2['emp_length'].str.extract(r'(\d+)', expand=False).astype(float)

# Rows without any digits (including missing values) become NaN above; fill
# them with the column median.
data2['emp_length'] = data2['emp_length'].fillna(data2['emp_length'].median())

Deleting all columns that contain any 'NaN' value.
#### Remark: 
This is not always a good choice, but we shall come back to it during modeling if the results aren't good (which may indicate that we dropped some important features).

In [None]:
# Names of the columns that still contain at least one missing value.
nan_column = [name for name in data2.columns if data2[str(name)].isnull().any()]
print(nan_column)

# Remove those columns from data2 in place.
data2.drop(columns=nan_column, inplace=True)

Applying label encoding to the other categorical features (columns) to convert them into numerical values. <br>
Doing so allows us to feed these features into the feature importance calculation.

In [None]:
# Remaining text columns, followed by the full column index.
text_columns = data2.select_dtypes(include=['object']).columns
print(list(text_columns))
print(data2.columns)

In [None]:
lb_make = LabelEncoder()

# Text columns replaced by integer codes, listed in the order they are encoded.
# LabelEncoder numbers the distinct values of a column in sorted order, e.g.
#   home_ownership      : ANY=0, MORTGAGE=1, NONE=2, OTHER=3, OWN=4, RENT=5
#   verification_status : Not Verified=0, Source Verified=1, Verified=2
#   pymnt_plan          : n=0, y=1
#   initial_list_status : f=0, w=1
#   application_type    : INDIVIDUAL=0, JOINT=1
# (value lists as recorded by the original author -- they depend on the data).
# NOTE(review): the binary-target cell later treats loan_status codes 0, 2, 3
# and 9 as defaults, so it relies on the sorted order of the status names.
#
# Not encoded here: emp_title (too many distinct groups to be useful),
# earliest_cr_line, last_pymnt_d, next_pymnt_d, last_credit_pull_d and
# verification_status_joint.
label_encoded_columns = [
    'grade',
    'sub_grade',
    'home_ownership',
    'issue_d',
    'verification_status',
    'loan_status',
    'pymnt_plan',
    'purpose',
    'zip_code',
    'addr_state',
    'initial_list_status',
    'application_type',
]

# The single encoder is re-fitted for each column in turn.
for column in label_encoded_columns:
    data2[column] = lb_make.fit_transform(data2[column])

X -> Features to use <br>
Y -> Prediction value

In [None]:
# Target: the label-encoded loan status.
Y = data2['loan_status']
# Features: every other column, then standardised with preprocessing.scale.
X_unscaled = data2.drop('loan_status', axis=1)
X = preprocessing.scale(X_unscaled)

### Data Split (Training and Testing)
70% -> Training <br>
30% -> Testing

In [None]:
# Split into training (70%) and testing (30%) sets. The fixed random_state
# makes the shuffle, and therefore every downstream result, repeatable from
# one run of the notebook to the next.
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.3, shuffle=True, random_state=10
)

### Identifying Feature Importance

In [None]:
# Fit an extra-trees ensemble on the training split and keep its per-feature
# importance scores.
clf = ExtraTreesClassifier(random_state=10).fit(X_train, y_train)
importances = clf.feature_importances_

In [None]:
# Boolean mask of features whose importance is at most 0.01, and the positions
# where that mask is True.
features_to_drop = importances <= 0.01
features_indexes = np.flatnonzero(features_to_drop)
print(features_indexes)

In [None]:
# One bar per feature. The labels come from X_unscaled, the frame the model
# was trained on: data2 still contains 'loan_status', so its column list has
# one extra entry and would label the bars with the wrong feature names.
data = [go.Bar(x=list(X_unscaled.columns), y=clf.feature_importances_)]
layout = go.Layout(
    title='Feature Importances',
    autosize=False,
    width=1000,
    height=500,
    # A plain dict replaces the deprecated go.Margin class and is accepted by
    # every plotly version.
    margin=dict(l=50, r=50, b=250, t=50, pad=4),
)
fig = go.Figure(data=data, layout=layout)
plotly.offline.plot(fig, filename='feature_importances')

In [None]:
# Wrap the already-fitted classifier as a feature selector with an importance
# threshold of 0.01, and apply it to the scaled feature matrix.
model = SelectFromModel(clf, prefit=True, threshold=0.01)
dataset = model.transform(X)
dataset.shape

## Step 3.  Modeling

### Target Conversion into Binary Classes
Defaulter (0) -> (Charged Off,0), (Default,2), (Does not meet the credit policy. Status:Charged Off,3), (Late (31-120 days),9)
<br> Non_defaulter (1)

In [None]:
# Encoded loan_status values mapped to class 0; every other value maps to 1.
defaulter_codes = [2, 3, 9, 0]
Y_binary = [0 if status_code in defaulter_codes else 1 for status_code in Y]

In [None]:
# Turn the label list into a NumPy array so it can be indexed with fold indices.
Y_binary = np.asarray(Y_binary)

### Model 1. Random Forest
Here the model is trained on the binary target `Y_binary` (defaulter vs. non-defaulter) and evaluated with 4-fold cross-validation.

In [None]:
# A fixed random_state makes the forest, and the reported scores, repeatable.
rfc = RandomForestClassifier(n_estimators=15, random_state=10)
k_fold = KFold(n_splits=4)

# 4-fold cross-validation on the selected features with the binary target.
# The split variables, y_pred and y_score are overwritten on every fold, so
# after the loop they hold the values of the last fold only.
for train_ind, test_ind in k_fold.split(dataset):
    X_train, X_test = dataset[train_ind], dataset[test_ind]
    y_train, y_test = Y_binary[train_ind], Y_binary[test_ind]
    rfc = rfc.fit(X_train, y_train)
    y_pred = rfc.predict(X_test)
    # Second column of predict_proba: the score for the second class.
    y_score = rfc.predict_proba(X_test)[:, 1]

    print("accuracy : ", rfc.score(X_test, y_test) * 100)

In [None]:
cols = ['model', 'matthews_corrcoef', 'roc_auc_score', 'precision_score', 'recall_score', 'f1_score']

conf_matrix = dict()

# Metrics for whatever y_test / y_pred / y_score currently hold (the last
# cross-validation fold when the cells are run top to bottom).
tmp = pd.Series({'model': 'Random Forest Classifier',
                 'roc_auc_score': metrics.roc_auc_score(y_test, y_score),
                 'matthews_corrcoef': metrics.matthews_corrcoef(y_test, y_pred),
                 'precision_score': metrics.precision_score(y_test, y_pred),
                 'recall_score': metrics.recall_score(y_test, y_pred),
                 'f1_score': metrics.f1_score(y_test, y_pred)})

# Build the one-row report directly: DataFrame.append() was removed in pandas 2.0.
models_report = pd.DataFrame([tmp], columns=cols)
conf_matrix[rfc] = pd.crosstab(y_test, y_pred, rownames=['True'], colnames=['Predicted'], margins=False)
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_score, drop_intermediate=False, pos_label=1)

# ROC curve with the diagonal drawn in black for reference.
plt.figure(1, figsize=(6, 6))
plt.xlabel('false positive rate')
plt.ylabel('true positive rate')
plt.plot(fpr, tpr, label='RFC')
plt.legend(loc=2, prop={'size': 11})
plt.plot([0, 1], [0, 1], color='black')

print(models_report, conf_matrix)

In [None]:
# Split the selected features and the binary target into training (70%) and
# testing (30%) sets. The fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    dataset, Y_binary, test_size=0.3, shuffle=True, random_state=10
)

### Model 2. Gradient Boosting regression


In [None]:
# 'loss' is deliberately not set: least squares is the regressor's default,
# and the old 'ls' spelling is rejected by newer scikit-learn releases.
params = {'n_estimators': 1000, 'max_depth': 4, 'min_samples_split': 3, 'learning_rate': 0.01}
# NOTE: this rebinds `clf`, which until now held the fitted ExtraTreesClassifier.
clf = ensemble.GradientBoostingRegressor(**params)

clf.fit(X_train, y_train)
mse = mean_squared_error(y_test, clf.predict(X_test))
print("MSE: %.4f" % mse)
# For a regressor, score() is the R^2 coefficient, not a classification accuracy.
print(clf.score(X_test, y_test) * 100)

### Model 3. Neural Networks (Multi layer perceptron)


In [None]:
# Multi-layer perceptron with three hidden layers (15, 10 and 5 units),
# trained by SGD with an adaptive learning rate.
mlp = MLPClassifier(
    hidden_layer_sizes=(15, 10, 5),
    solver='sgd',
    alpha=1e-5,
    learning_rate='adaptive',
    random_state=1,
)

k_fold = KFold(n_splits=4)

# 4-fold cross-validation; the variables set inside the loop keep the values
# of the last fold once it finishes.
for train_ind, test_ind in k_fold.split(dataset):
    X_train = dataset[train_ind]
    X_test = dataset[test_ind]
    y_train = Y_binary[train_ind]
    y_test = Y_binary[test_ind]

    mlp = mlp.fit(X_train, y_train)
    y_pred = mlp.predict(X_test)
    # Second column of predict_proba: the score for the second class.
    y_score = mlp.predict_proba(X_test)[:, 1]
    print("accuracy : ", mlp.score(X_test, y_test)*100)

In [None]:
cols = ['model', 'matthews_corrcoef', 'roc_auc_score', 'precision_score', 'recall_score', 'f1_score']

conf_matrix = dict()

# Metrics for whatever y_test / y_pred / y_score currently hold (the last
# cross-validation fold when the cells are run top to bottom).
tmp = pd.Series({'model': 'Multi layer Perceptron',
                 'roc_auc_score': metrics.roc_auc_score(y_test, y_score),
                 'matthews_corrcoef': metrics.matthews_corrcoef(y_test, y_pred),
                 'precision_score': metrics.precision_score(y_test, y_pred),
                 'recall_score': metrics.recall_score(y_test, y_pred),
                 'f1_score': metrics.f1_score(y_test, y_pred)})

# Build the one-row report directly: DataFrame.append() was removed in pandas 2.0.
models_report = pd.DataFrame([tmp], columns=cols)
conf_matrix[mlp] = pd.crosstab(y_test, y_pred, rownames=['True'], colnames=['Predicted'], margins=False)
fpr, tpr, thresholds = metrics.roc_curve(y_test, y_score, drop_intermediate=False, pos_label=1)

# ROC curve with the diagonal drawn in black for reference.
plt.figure(1, figsize=(6, 6))
plt.xlabel('false positive rate')
plt.ylabel('true positive rate')
plt.plot(fpr, tpr, label='MLP')
plt.legend(loc=2, prop={'size': 11})
plt.plot([0, 1], [0, 1], color='black')

print(models_report, conf_matrix)

### Model 4. Neural Network (Tensorflow)

In [None]:
# Number of distinct classes in the binary target.
n_targets = np.unique(Y_binary).size

# Learning parameters for the TensorFlow network.
learning_rate, num_epochs = 0.001, 8
batch_size, display_step = 100, 1

In [None]:
def weight_variable(shape):
  """Return a tf.Variable of `shape` initialised from a truncated normal (stddev 0.1)."""
  return tf.Variable(tf.truncated_normal(shape, stddev=0.1))

def bias_variable(shape):
  """Return a tf.Variable of `shape` with every element initialised to 0.1."""
  return tf.Variable(tf.constant(0.1, shape=shape))

In [None]:
# Input width follows the selected feature matrix instead of a hard-coded 15,
# so a different feature selection does not break the graph.
n_features = dataset.shape[1]

# Inputs: a batch of feature rows, and one column per target class for labels.
x = tf.placeholder(tf.float32, [None, n_features])
y = tf.placeholder(tf.float32, [None, n_targets])

# First hidden layer: n_features -> 200 units.
w1 = weight_variable([n_features, 200])
b1 = bias_variable([200])
# Second hidden layer: 200 -> 500 units. The bias uses bias_variable, like
# the other biases, rather than the weight initialiser.
w2 = weight_variable([200, 500])
b2 = bias_variable([500])

# Output (softmax) layer: 500 -> n_targets.
sm_w = weight_variable([500, n_targets])
sm_b = bias_variable([n_targets])

In [None]:
# Scalar fed at run time and passed as the second argument of tf.nn.dropout
# (the training cell feeds 0.5, the evaluation feeds 1.0).
drop_prob = tf.placeholder("float")

# Two ReLU hidden layers.
hidden1_pre = tf.matmul(x, w1) + b1
affine = tf.nn.relu(hidden1_pre)
hidden2_pre = tf.matmul(affine, w2) + b2
affine2 = tf.nn.relu(hidden2_pre)

# Dropout on the second hidden layer, then the output layer and its softmax.
drop_out = tf.nn.dropout(affine2, drop_prob)
sm_affine = tf.matmul(drop_out, sm_w) + sm_b
sm = tf.nn.softmax(sm_affine)

In [None]:
# Loss function: summed softmax cross-entropy between the labels y and the
# network output. It is computed from the logits (sm_affine) with the fused
# op, which is the same quantity as -sum(y * log(softmax)) but does not
# produce NaN when a softmax output underflows to 0.
cost = tf.reduce_sum(
    tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=sm_affine)
)
optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(cost)

In [None]:
# A row is correct when the highest-scoring output column matches the
# highest-valued label column.
predicted_class = tf.argmax(sm, 1)
true_class = tf.argmax(y, 1)
correct_prediction = tf.equal(predicted_class, true_class)
# Accuracy: mean of the correctness flags cast to float.
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))

In [None]:
# Op that initialises every variable defined above.
# tf.global_variables_initializer() is the replacement for the deprecated
# tf.initialize_all_variables().
init = tf.global_variables_initializer()

In [None]:
# Split the selected features and the binary target into training (70%) and
# testing (30%) sets. The fixed random_state makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    dataset, Y_binary, test_size=0.3, shuffle=True, random_state=10
)

In [None]:
saver = tf.train.Saver()

# Row k of this identity matrix is the one-hot vector for class k. Indexing it
# with integer labels produces the (rows, n_targets) array the `y` placeholder
# expects; feeding the labels as a single (rows, 1) column does not match it.
one_hot = np.eye(n_targets, dtype=np.float32)

with tf.Session() as session:
    session.run(init)  # initialise all variables
    n_train = X_train.shape[0]
    for epoch in range(num_epochs):
        print("epoch : ", epoch + 1)
        # Walk over the training rows in consecutive full batches; a trailing
        # partial batch is skipped. batch_size itself is never modified, so
        # the cell can be re-run safely.
        for start in range(0, n_train - batch_size + 1, batch_size):
            stop = start + batch_size
            xs = X_train[start:stop]
            ys = one_hot[y_train[start:stop]]
            session.run(optimizer, feed_dict={x: xs, y: ys, drop_prob: 0.5})

    print("Optimization Finished")
    saver.save(session, 'model.ckpt')
    # Evaluate on the test split with dropout switched off (drop_prob = 1.0).
    y_test_one_hot = one_hot[y_test]
    print("Accuracy:", accuracy.eval({x: X_test, y: y_test_one_hot, drop_prob: 1.0}) * 100)