<a href="https://colab.research.google.com/github/cnyakundi/financial_inclusion/blob/master/notebooks/Multivariate_Analysis_Discriminant_Analysis.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Multivariate Analysis with Python: Linear Discriminant Analysis

## Linear Discriminant Analysis

### Analysis 

In [32]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns


In [33]:
# Step 1: Importing the libraries that we will need 
#
import numpy as np
import pandas as pd

In [34]:
# Step 2: Loading the Dataset
#
df = pd.read_csv("Financial Dataset - 1.csv")
df.head()

Unnamed: 0,country,year,uniqueid,Has a Bank account,Type of Location,Cell Phone Access,household_size,Respondent Age,gender_of_respondent,The relathip with head,marital_status,Level of Educuation,Type of Job
0,Kenya,2018,uniqueid_1,Yes,Rural,Yes,3.0,24.0,Female,Spouse,Married/Living together,Secondary education,Self employed
1,Kenya,2018,uniqueid_2,No,Rural,No,5.0,70.0,Female,Head of Household,Widowed,No formal education,Government Dependent
2,Kenya,2018,uniqueid_3,Yes,Urban,Yes,5.0,26.0,Male,Other relative,Single/Never Married,Vocational/Specialised training,Self employed
3,Kenya,2018,uniqueid_4,No,Rural,Yes,5.0,34.0,Female,Head of Household,Married/Living together,Primary education,Formally employed Private
4,Kenya,2018,uniqueid_5,No,Urban,No,8.0,26.0,Male,Child,Single/Never Married,Primary education,Informally employed


In [35]:
# We can convert Cell Phone Access and gender of respondents into categories. This will reduce memory usage 

df['Has a Bank account'].fillna('0')


# df['Cell Phone Access']=df['Cell Phone Access'].astype('category')


df['gender_of_respondent']=df['gender_of_respondent'].astype('category')



#We convert country, unique_id, the relationship with head, marital status, level of education and type of job into strings 

df['year']=pd.to_datetime(df['year']) # Convert date to datetime 
df['country']=df['country'].astype('str')
df['uniqueid']=df['uniqueid'].astype('str')
df['uniqueid']=df['uniqueid'].astype('str')
df['The relathip with head']=df['The relathip with head'].astype('str')
df['marital_status']=df['marital_status'].astype('str')
df['Level of Educuation']=df['Level of Educuation'].astype('str')
df['Type of Job']=df['Type of Job'].astype('str')



# Replace yes and No with '1' amd '0' then cast into integers 

df.replace(to_replace=['Yes','No'],value=['1','0'], inplace=True)


df['Has a Bank account'].fillna('0', inplace=True)


df['Has a Bank account']=df['Has a Bank account'].astype('int')
df.replace(to_replace=['Yes','No'],value=['1','0'], inplace=True)

# Replace yes and No with '1' amd '0' then cast into integers

df['Cell Phone Access'].fillna('0', inplace=True)

df['Cell Phone Access']=df['Cell Phone Access'].astype('int')

# Converting Type of location into integers 

df.replace(to_replace=['Rural','Urban'],value=['0','1'], inplace=True)
df['Type of Location'].fillna('0', inplace=True)

df['Type of Location']=df['Type of Location'].astype('int')

# Dealing with the Missing Data
# df['Respondent Age']=df['Respondent Age'].fillna(df['Respondent Age'].mode())


# df.isnull().sum()


# We fill all these with 0 because many people dont have bank accounts, live in rural areas and dont have cell phone access

df['Has a Bank account'].fillna('0', inplace=True)

df['Type of Location'].fillna('0', inplace=True)

df['Cell Phone Access'].fillna('0', inplace=True)

df.head()

Unnamed: 0,country,year,uniqueid,Has a Bank account,Type of Location,Cell Phone Access,household_size,Respondent Age,gender_of_respondent,The relathip with head,marital_status,Level of Educuation,Type of Job
0,Kenya,1970-01-01 00:00:00.000002018,uniqueid_1,1,0,1,3.0,24.0,Female,Spouse,Married/Living together,Secondary education,Self employed
1,Kenya,1970-01-01 00:00:00.000002018,uniqueid_2,0,0,0,5.0,70.0,Female,Head of Household,Widowed,No formal education,Government Dependent
2,Kenya,1970-01-01 00:00:00.000002018,uniqueid_3,1,1,1,5.0,26.0,Male,Other relative,Single/Never Married,Vocational/Specialised training,Self employed
3,Kenya,1970-01-01 00:00:00.000002018,uniqueid_4,0,0,1,5.0,34.0,Female,Head of Household,Married/Living together,Primary education,Formally employed Private
4,Kenya,1970-01-01 00:00:00.000002018,uniqueid_5,0,1,0,8.0,26.0,Male,Child,Single/Never Married,Primary education,Informally employed


In [36]:
df.drop([
       'gender_of_respondent', 'The relathip with head', 'marital_status',
       'Level of Educuation', 'Type of Job','year', 'uniqueid'], axis =1, inplace=True)

In [37]:
# Once dataset is loaded into a pandas data frame object, the first step is to divide dataset 

X = df.drop('country', 1)
y = df['country']

df.head()

df.dropna(inplace=True)

In [38]:
# The following code divides data into training and test sets

from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

In [39]:

from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)

X_train

array([[-0.40127504, -0.79982136,  0.59242648,  0.13799761, -0.89726836],
       [-0.40127504,  1.25027918,  0.59242648, -0.29881664, -0.53480595],
       [-0.40127504, -0.79982136,  0.59242648, -1.17244514,         nan],
       ...,
       [-0.40127504,  1.25027918,  0.59242648,  0.13799761, -0.41398515],
       [-0.40127504, -0.79982136, -1.68797317,  1.01162611, -0.65562675],
       [-0.40127504, -0.79982136,  0.59242648,  0.13799761, -0.47439555]])

### Peforming LDA

In [40]:

from sklearn.discriminant_analysis import LinearDiscriminantAnalysis as LDA
lda = LDA(n_components=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
X_train = lda.fit_transform(X_train, y_train)
X_test = lda.transform(X_test)


ValueError: ignored

In [42]:


from sklearn.ensemble import RandomForestClassifier

classifier = RandomForestClassifier(max_depth=2, random_state=0)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)

ValueError: ignored

In [41]:



from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score

cm = confusion_matrix(y_test, y_pred)
print(cm)
print('Accuracy' + str(accuracy_score(y_test, y_pred)))


NameError: ignored

### <font color="green">Challenges</font>

In [None]:
# Challenge 1
# ---
# Question: Perform linear discriminant analysis to predict the cellular localization sites of proteins
# Dataset url = https://www.kaggle.com/imnikhilanand/heart-attack-analysis/data
# ---
# 
df = pd.read_csv('https://www.kaggle.com/imnikhilanand/heart-attack-analysis/data')
df.head()

In [None]:
# Challenge 2
# ---
# Question: Using the breast cancer wisconsin (diagnostic) dataset perform linear discriminant analysis
# Dataset url = http://bit.ly/BreastCancerDataset
# --
#
OUR CODE GOES HERE