In [2]:
import pandas as pd
import numpy as np
from sklearn import preprocessing
from sklearn import tree

# Survived Analysis (1)

In [3]:
df = pd.read_csv("train.csv")

In [4]:
df.head()

Unnamed: 0,PassengerId,Survived,Pclass,Name,Sex,Age,SibSp,Parch,Ticket,Fare,Cabin,Embarked
0,1,0,3,"Braund, Mr. Owen Harris",male,22.0,1,0,A/5 21171,7.25,,S
1,2,1,1,"Cumings, Mrs. John Bradley (Florence Briggs Th...",female,38.0,1,0,PC 17599,71.2833,C85,C
2,3,1,3,"Heikkinen, Miss. Laina",female,26.0,0,0,STON/O2. 3101282,7.925,,S
3,4,1,1,"Futrelle, Mrs. Jacques Heath (Lily May Peel)",female,35.0,1,0,113803,53.1,C123,S
4,5,0,3,"Allen, Mr. William Henry",male,35.0,0,0,373450,8.05,,S


In [5]:
le = preprocessing.LabelEncoder()

df.Sex = le.fit_transform(df["Sex"])

In [6]:
from sklearn.ensemble import RandomForestClassifier

In [8]:
df.columns

Index(['PassengerId', 'Survived', 'Pclass', 'Name', 'Sex', 'Age', 'SibSp',
       'Parch', 'Ticket', 'Fare', 'Cabin', 'Embarked'],
      dtype='object')

In [9]:
rf_model = RandomForestClassifier(n_estimators=1000, max_features=2, oob_score=True)

In [10]:
features = ["Age","Sex","Fare","Pclass","SibSp"]

In [11]:
rf_model.fit(X=df[features], y=df.Survived)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=2, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=True, random_state=None, verbose=0, warm_start=False)

In [12]:
print("OOB Accuracy : ")
print(rf_model.oob_score_)

OOB Accuracy : 
0.8110236220472441


In [13]:
# Now Getting IMP Features

for feature,imp in zip(features, rf_model.feature_importances_):
    print(feature,imp)

Age 0.27262674686966404
Sex 0.27454369904810266
Fare 0.3125585810939026
Pclass 0.08943011819447104
SibSp 0.05084085479385896


## Now we got important variables as Sex, Age and Fare

In [14]:
tree_model = tree.DecisionTreeClassifier()

In [15]:
feature = ["Age","Sex","Fare"]

In [16]:
tree_model.fit(X=df[feature], y=df.Survived)

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=None,
            max_features=None, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

## For Visualization

In [18]:
with open("survived.dot","w") as f:
    f = tree.export_graphviz(tree_model, feature_names=["Age","Sex","Fare"], out_file=f);

## Prediction

In [19]:
df1= pd.read_csv("test.csv")

In [20]:
df1["Sex"] = le.fit_transform(df1.Sex)

In [21]:
test_features = pd.DataFrame([df1["Age"], df1["Sex"], df1["Fare"]]).T

In [22]:
test_prediction = tree_model.predict(X=test_features)

In [23]:
preds = pd.DataFrame({"Passenger ID":df1['PassengerId'], "Survived":test_prediction})

In [25]:
preds.to_csv("output.csv", index=False)

# Attrition analysis (2)

In [27]:
data = pd.read_csv("general_data.csv")

In [28]:
data.head()

Unnamed: 0,Age,Attrition,BusinessTravel,Department,DistanceFromHome,Education,EducationField,EmployeeCount,EmployeeID,Gender,...,NumCompaniesWorked,Over18,PercentSalaryHike,StandardHours,StockOptionLevel,TotalWorkingYears,TrainingTimesLastYear,YearsAtCompany,YearsSinceLastPromotion,YearsWithCurrManager
0,51,No,Travel_Rarely,Sales,6,2,Life Sciences,1,1,Female,...,1.0,Y,11,8,0,1.0,6,1,0,0
1,31,Yes,Travel_Frequently,Research & Development,10,1,Life Sciences,1,2,Female,...,0.0,Y,23,8,1,6.0,3,5,1,4
2,32,No,Travel_Frequently,Research & Development,17,4,Other,1,3,Male,...,1.0,Y,15,8,3,5.0,2,5,0,3
3,38,No,Non-Travel,Research & Development,2,5,Life Sciences,1,4,Male,...,3.0,Y,11,8,3,13.0,5,8,7,5
4,32,No,Travel_Rarely,Research & Development,10,1,Medical,1,5,Male,...,4.0,Y,12,8,2,9.0,2,6,0,4


In [30]:
from sklearn.model_selection import train_test_split

In [31]:
data["Attrition"] = le.fit_transform(data.Attrition)
data["BusinessTravel"] = le.fit_transform(data.BusinessTravel)
data["Department"] = le.fit_transform(data.Department)
data["EducationField"] = le.fit_transform(data.EducationField)
data["Gender"] = le.fit_transform(data.Gender)
data["Over18"] = le.fit_transform(data.Over18)
data["JobRole"]= le.fit_transform(data.JobRole)
data["MaritalStatus"]= le.fit_transform(data.MaritalStatus)

In [32]:
data = data.fillna(method='bfill')

data = data.fillna(method='ffill')

In [33]:
data.isna().sum()

Age                        0
Attrition                  0
BusinessTravel             0
Department                 0
DistanceFromHome           0
Education                  0
EducationField             0
EmployeeCount              0
EmployeeID                 0
Gender                     0
JobLevel                   0
JobRole                    0
MaritalStatus              0
MonthlyIncome              0
NumCompaniesWorked         0
Over18                     0
PercentSalaryHike          0
StandardHours              0
StockOptionLevel           0
TotalWorkingYears          0
TrainingTimesLastYear      0
YearsAtCompany             0
YearsSinceLastPromotion    0
YearsWithCurrManager       0
dtype: int64

In [34]:
data.columns

Index(['Age', 'Attrition', 'BusinessTravel', 'Department', 'DistanceFromHome',
       'Education', 'EducationField', 'EmployeeCount', 'EmployeeID', 'Gender',
       'JobLevel', 'JobRole', 'MaritalStatus', 'MonthlyIncome',
       'NumCompaniesWorked', 'Over18', 'PercentSalaryHike', 'StandardHours',
       'StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
       'YearsAtCompany', 'YearsSinceLastPromotion', 'YearsWithCurrManager'],
      dtype='object')

In [35]:
data = data.drop(['Over18','EmployeeCount', 'EmployeeID','StandardHours'],axis=1)

In [36]:
data = data.dropna()
data1 = data.drop_duplicates()

In [37]:
rf = RandomForestClassifier(n_estimators=1000, max_features=2, oob_score=True)

In [38]:
afeatures = ['Age', 'BusinessTravel', 'Department', 'DistanceFromHome',
       'Education', 'EducationField','Gender','JobLevel', 'JobRole', 'MaritalStatus', 'MonthlyIncome',
       'NumCompaniesWorked','PercentSalaryHike','StockOptionLevel', 'TotalWorkingYears', 'TrainingTimesLastYear',
       'YearsAtCompany', 'YearsSinceLastPromotion', 'YearsWithCurrManager']

In [39]:
rf_model.fit(X=data[afeatures], y=data["Attrition"])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=2, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=True, random_state=None, verbose=0, warm_start=False)

In [41]:
print("OOB Score : ",rf_model.oob_score_)

OOB Score :  0.9997732426303855


In [42]:
for feature,imp in zip(afeatures, rf_model.feature_importances_):
    print(feature,"\t",imp);

Age 	 0.09765628725729047
BusinessTravel 	 0.02856735948538756
Department 	 0.026019674081638158
DistanceFromHome 	 0.06969588877835496
Education 	 0.040705540190379634
EducationField 	 0.04155401538772353
Gender 	 0.018426597033548472
JobLevel 	 0.03780312258153053
JobRole 	 0.05521180239511245
MaritalStatus 	 0.039353707511877487
MonthlyIncome 	 0.09496384511962325
NumCompaniesWorked 	 0.055560370823467556
PercentSalaryHike 	 0.06584688692180651
StockOptionLevel 	 0.03390766947946431
TotalWorkingYears 	 0.08550843454972337
TrainingTimesLastYear 	 0.044196544250991385
YearsAtCompany 	 0.0685084313488622
YearsSinceLastPromotion 	 0.04278326685975767
YearsWithCurrManager 	 0.05373055594346083


## Age, Monthly Income, TotalWorkingYears

In [46]:
attr_model = tree.DecisionTreeClassifier(max_depth=6,max_leaf_nodes=10)

In [47]:
feat = pd.DataFrame([data['Age'],data['MonthlyIncome'],data['TotalWorkingYears']]).T
attr_model.fit(X=feat, y=data["Attrition"])

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=6,
            max_features=None, max_leaf_nodes=10,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [49]:
with open("attr_tree.dot",'w') as  f:
    f=tree.export_graphviz(attr_model,feature_names=['Age','MonthlyIncome','TotalWorkingYears'],out_file=f);

## if TotalWorkingYears is < 1.5 and monthly income is between 23000 to 110000 then the chance of attrition is high
## if TotalWorkingYears is > 1.5 , Age is < 33 then rate of attrition is very low

# Bank Loan Modelling (3)

In [51]:
dataset = pd.read_excel("Bank_Personal_Loan_Modelling.xlsx",sheet_name=1)

In [52]:
bankr_model = RandomForestClassifier(n_estimators=1000, max_features=2, oob_score=True)

In [53]:
dataset.columns

Index(['ID', 'Age', 'Experience', 'Income', 'ZIP Code', 'Family', 'CCAvg',
       'Education', 'Mortgage', 'Personal Loan', 'Securities Account',
       'CD Account', 'Online', 'CreditCard'],
      dtype='object')

In [54]:
dataset1 = dataset.drop(['ID','ZIP Code'], axis =1)
dataset2 = dataset1.dropna()
dataset3 = dataset2.drop_duplicates()
dataset3.head()

Unnamed: 0,Age,Experience,Income,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,25,1,49,4,1.6,1,0,0,1,0,0,0
1,45,19,34,3,1.5,1,0,0,1,0,0,0
2,39,15,11,1,1.0,1,0,0,0,0,0,0
3,35,9,100,1,2.7,2,0,0,0,0,0,0
4,35,8,45,4,1.0,2,0,0,0,0,0,1


In [55]:
dataset3['CCAvg'] = np.round(dataset3['CCAvg'])
dataset3.head()

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


Unnamed: 0,Age,Experience,Income,Family,CCAvg,Education,Mortgage,Personal Loan,Securities Account,CD Account,Online,CreditCard
0,25,1,49,4,2.0,1,0,0,1,0,0,0
1,45,19,34,3,2.0,1,0,0,1,0,0,0
2,39,15,11,1,1.0,1,0,0,0,0,0,0
3,35,9,100,1,3.0,2,0,0,0,0,0,0
4,35,8,45,4,1.0,2,0,0,0,0,0,1


In [56]:
bfeature = ['Age', 'Experience', 'Income','Family', 'CCAvg',
       'Education', 'Mortgage', 'Securities Account',
       'CD Account', 'Online', 'CreditCard']

In [59]:
bankr_model.fit(X=dataset[bfeature], y=dataset["Personal Loan"])

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
            max_depth=None, max_features=2, max_leaf_nodes=None,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, n_estimators=1000, n_jobs=1,
            oob_score=True, random_state=None, verbose=0, warm_start=False)

In [60]:
print(bankr_model.oob_score_)

0.9884


In [62]:
for feature,imp in zip(bfeature, bankr_model.feature_importances_):
    print(feature,"\t",imp)

Age 	 0.0447445021524499
Experience 	 0.04374677295564817
Income 	 0.349587454667695
Family 	 0.09352982273577225
CCAvg 	 0.18520428465385327
Education 	 0.1605790944592125
Mortgage 	 0.044421638058506865
Securities Account 	 0.005385825252769102
CD Account 	 0.0544997384124734
Online 	 0.008407699960072887
CreditCard 	 0.009893166691546851


In [75]:
bank_model = tree.DecisionTreeClassifier(max_depth=6, max_leaf_nodes=10)
predictors = pd.DataFrame([dataset3["Income"],dataset3["CCAvg"],dataset3["Education"]]).T

bank_model.fit(X=dataset3[bfeature], y=dataset3["Personal Loan"])

DecisionTreeClassifier(class_weight=None, criterion='gini', max_depth=6,
            max_features=None, max_leaf_nodes=10,
            min_impurity_decrease=0.0, min_impurity_split=None,
            min_samples_leaf=1, min_samples_split=2,
            min_weight_fraction_leaf=0.0, presort=False, random_state=None,
            splitter='best')

In [77]:
with open("Dtree.dot",'w') as f:
    f = tree.export_graphviz(bank_model,feature_names=['Education','CCAvg','Income'],out_file=f);

ValueError: Length of feature_names, 3 does not match number of features, 11

## When income is less than 100 dollar probability of getting loan is less