## Hunton - Original Analysis Case Study - Part 5
### World Risk Poll

The findings of the World Risk Poll are based on over 150,000 interviews by Gallup in 142 countries. More information can be found on their website here: https://wrp.lrfoundation.org.uk/

Load packages and prepare data (from Part 1)

In [1]:
# Load libraries
import pandas as pd
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.ensemble import RandomForestClassifier

# Load the survey export; cells holding the literal '#NULL!' are parsed as missing (NaN)
df = pd.read_csv('RiskData.csv', na_values=['#NULL!'])

# Show (rows, columns) of the raw data
df.shape

(154195, 90)

Since previous EDA has shown that none of the respondents' answers to the questions have any bearing on whether people feel more or less safe, let's look at just the demographic information.

In [2]:
# Keep only the demographic predictors plus the target column (L2)
demographic_cols = ['GlobalRegion', 'CountryIncomeLevel', 'Age', 'Gender', 'Education', 'IncomeFeelings',
                    'INCOME_5', 'Urbanicity', 'HouseholdSize', 'L2']
df = df[demographic_cols]

# Get rid of rows with missing or invalid values.
# A row is dropped when ANY column holds 98 or 99, or when one of the bounded
# columns is outside its accepted range. `axis=1` is passed by keyword because
# the positional form `.any(1)` is no longer accepted by current pandas.
has_invalid_code = df.isin([98.0, 99.0]).any(axis=1)
valid_rows = (
    ~has_invalid_code
    & (df['Age'] <= 97)
    & (df['Education'] <= 3)
    & (df['IncomeFeelings'] <= 4)
    & (df['Urbanicity'] <= 2)
)

# Build a fresh frame instead of dropping in place on a slice; the explicit
# copy lets later cells assign to columns without SettingWithCopyWarning.
df = df[valid_rows].dropna().copy()

df.shape

(144952, 10)

Scale demographic variables.

In [3]:
# Column groups that share a recoding scheme
yes_no = ['Urbanicity']
income_4_up = ['CountryIncomeLevel']
income_4_down = ['IncomeFeelings']
income_5 = ['INCOME_5']

# Each entry pairs a column group with its {old code: scaled value} mapping:
#   - Urbanicity: Rural = 0, City = 1
#   - income scales: Lowest Income = 0, Highest Income = 1, values in between
#     (the "down" scale is reversed; its code 1 already equals the top value 1)
recodings = [
    (yes_no, {1: 0, 2: 1}),
    (income_4_up, {1: 0, 2: 0.33, 3: 0.66, 4: 1}),
    (income_4_down, {2: 0.66, 3: 0.33, 4: 0}),
    (income_5, {1: 0, 2: 0.25, 3: 0.5, 4: 0.75, 5: 1}),
]
for columns, mapping in recodings:
    for col in columns:
        df[col] = df[col].replace(mapping)

# Dependent variable, Question 2 (how safe do you feel?):
# Same = 0, Less = -1, More = 1 (code 1 is left as-is)
df['L2'] = df['L2'].replace({2: -1, 3: 0})

# Review data
df.head()

Unnamed: 0,GlobalRegion,CountryIncomeLevel,Age,Gender,Education,IncomeFeelings,INCOME_5,Urbanicity,HouseholdSize,L2
0,11.0,0.33,58.0,2.0,2.0,0.33,0.75,1.0,3.0,-1.0
2,7.0,0.0,36.0,1.0,3.0,0.0,1.0,1.0,3.0,0.0
3,13.0,1.0,45.0,2.0,3.0,1.0,0.5,0.0,2.0,0.0
4,8.0,1.0,29.0,2.0,3.0,0.66,1.0,1.0,1.0,-1.0
5,8.0,0.66,35.0,2.0,1.0,0.66,0.5,0.0,2.0,1.0


Let's look at the income variables separately from the other information.

In [14]:
# Create 2 additional sets of data:
#   1. Income only data
#   2. Non-income data

# Use an ordered list, not a set, to select the income columns: set indexers are
# rejected by current pandas, and a set gives no guaranteed column order, which
# would make the feature layout (and so the fitted models) vary between runs.
income_cols = ['CountryIncomeLevel', 'IncomeFeelings', 'INCOME_5']

all_df = df.drop(columns=['L2'])
all_target = df['L2']
income_df = df[income_cols]
income_target = df['L2']
non_inc_df = df.drop(columns=income_cols + ['L2'])
non_inc_target = df['L2']

# Separate all three sets of data into training and testing data.
# The same test_size and random_state are used for every split.
all_X_train, all_X_test, all_y_train, all_y_test = train_test_split(all_df, all_target, test_size=0.3, random_state=11)
income_X_train, income_X_test, income_y_train, income_y_test = train_test_split(income_df, income_target,
                                                                                test_size=0.3, random_state=11)
non_inc_X_train, non_inc_X_test, non_inc_y_train, non_inc_y_test = train_test_split(non_inc_df, non_inc_target,
                                                                                    test_size=0.3, random_state=11)

# Create function to calculate accuracy of predictions
def accuracy(cm):
    """Return the fraction of predictions on the diagonal of a confusion matrix.

    Parameters
    ----------
    cm : square array exposing ``trace()`` and ``sum()`` (e.g. a NumPy array)
        Confusion matrix of counts.

    Returns
    -------
    The sum of the diagonal entries divided by the sum of all entries.
    """
    correct = cm.trace()
    total = cm.sum()
    return correct / total

# Report how many rows ended up in the training and testing partitions
n_train, n_test = len(all_X_train), len(all_X_test)
print('Training data size: ', n_train)
print('Testing data size: ', n_test)

Training data size:  101466
Testing data size:  43486


Neural Net

In [6]:
# Mini-batch sizes to evaluate. Defined here so the cell runs on a fresh kernel;
# the recorded results below are keyed by 3, so that is the value kept.
batch = [3]

# Accuracy per batch size, one dict per feature set
income_results = {}
non_inc_results = {}
all_results = {}

for b in batch:
    # Define the classifier. batch_size follows the loop variable so that each
    # entry of `batch` is actually evaluated.
    # NOTE(review): per the scikit-learn docs `learning_rate` is only used when
    # solver='sgd', so 'adaptive' should have no effect with adam - confirm.
    classifier = MLPClassifier(hidden_layer_sizes=(150,100,50), solver='adam', max_iter=500, learning_rate_init=0.0001,
                               learning_rate='adaptive', batch_size=b, random_state=1)

    # Fit the training data to the network, make prediction and check accuracy for income data
    classifier.fit(income_X_train, income_y_train)
    income_y_pred = classifier.predict(income_X_test)
    # confusion_matrix expects (y_true, y_pred); accuracy is the same either way,
    # but this keeps rows = actual classes and columns = predicted classes.
    cm = confusion_matrix(income_y_test, income_y_pred)
    ac = accuracy(cm)
    income_results[b] = ac

    # Fit the training data to the network, make prediction and check accuracy for non-income data
    classifier.fit(non_inc_X_train, non_inc_y_train)
    non_inc_y_pred = classifier.predict(non_inc_X_test)
    cm = confusion_matrix(non_inc_y_test, non_inc_y_pred)
    ac = accuracy(cm)
    non_inc_results[b] = ac

    # Fit the training data to the network, make prediction and check accuracy for all data
    classifier.fit(all_X_train, all_y_train)
    all_y_pred = classifier.predict(all_X_test)
    cm = confusion_matrix(all_y_test, all_y_pred)
    ac = accuracy(cm)
    all_results[b] = ac

print('Accuracy of MLPClassifier on Income : ', income_results)
print('Accuracy of MLPClassifier on Non-Income : ', non_inc_results)
print('Accuracy of MLPClassifier overall : ', all_results)

Accuracy of MLPClassifier on Income :  {3: 0.46368946327553695}
Accuracy of MLPClassifier on Non-Income :  {3: 0.4810743687623603}
Accuracy of MLPClassifier overall :  {3: 0.5079335878213678}


Logistic Regression (showing best results after optimizing for the number of iterations).

In [18]:
# Set up Logistic Regression model (multinomial, since L2 takes the three values -1, 0 and 1)
classifier = linear_model.LogisticRegression(multi_class='multinomial', solver='lbfgs', max_iter = 500)

# Each fit() below retrains the same estimator object on a different feature set.
# confusion_matrix is called as (y_true, y_pred), so rows are actual classes and
# columns are predicted classes in the stored matrices.

# Fit the training data to the regression, make prediction and build the confusion matrix for income data
classifier.fit(income_X_train, income_y_train)
income_y_pred = classifier.predict(income_X_test)
income_cm = confusion_matrix(income_y_test, income_y_pred)

# Fit the training data to the regression, make prediction and build the confusion matrix for non-income data
classifier.fit(non_inc_X_train, non_inc_y_train)
non_inc_y_pred = classifier.predict(non_inc_X_test)
non_inc_cm = confusion_matrix(non_inc_y_test, non_inc_y_pred)

# Fit the training data to the regression, make prediction and build the confusion matrix for all data
classifier.fit(all_X_train, all_y_train)
all_y_pred = classifier.predict(all_X_test)
all_cm = confusion_matrix(all_y_test, all_y_pred)

print('Accuracy of logistic regression on Income : ', accuracy(income_cm))
print('Accuracy of logistic regression on Non-Income : ', accuracy(non_inc_cm))
print('Accuracy of logistic regression overall : ', accuracy(all_cm))

Accuracy of logistic regression on Income :  0.4624246884054638
Accuracy of logistic regression on Non-Income :  0.46955341949133056
Accuracy of logistic regression overall :  0.4767281423906545


K-Nearest Neighbors (showing best results after optimizing for value of k).

In [20]:
# Set up model: k-nearest neighbours with k = 9
classifier = KNeighborsClassifier(n_neighbors=9)

# Each fit() below retrains the same estimator object on a different feature set.
# confusion_matrix is called as (y_true, y_pred), its documented argument order;
# the accuracy computed from it does not depend on that order.

# Fit the training data to the knn classifier, make prediction and check accuracy for income data
classifier.fit(income_X_train, income_y_train)
income_y_pred = classifier.predict(income_X_test)
cm = confusion_matrix(income_y_test, income_y_pred)
income_results = accuracy(cm)

# Fit the training data to the knn classifier, make prediction and check accuracy for non-income data
classifier.fit(non_inc_X_train, non_inc_y_train)
non_inc_y_pred = classifier.predict(non_inc_X_test)
cm = confusion_matrix(non_inc_y_test, non_inc_y_pred)
non_inc_results = accuracy(cm)

# Fit the training data to the knn classifier, make prediction and check accuracy for all data
classifier.fit(all_X_train, all_y_train)
all_y_pred = classifier.predict(all_X_test)
cm = confusion_matrix(all_y_test, all_y_pred)
all_results = accuracy(cm)

print('Accuracy of knn on Income : ', income_results)
print('Accuracy of knn on Non-Income : ', non_inc_results)
print('Accuracy of knn overall : ', all_results)

Accuracy of knn on Income :  0.4164328749482592
Accuracy of knn on Non-Income :  0.44398197120912475
Accuracy of knn overall :  0.45161661224302074


Random Forest

In [21]:
# Set up Random Forest. random_state is fixed so that re-running the cell
# reproduces the same forest (and therefore the same accuracies).
classifier = RandomForestClassifier(n_estimators=100, random_state=1)

# Each fit() below retrains the same estimator object on a different feature set.
# confusion_matrix is called as (y_true, y_pred), its documented argument order;
# the accuracy computed from it does not depend on that order.

# Fit the training data to the random forest, make prediction and check accuracy for income data
classifier.fit(income_X_train, income_y_train)
income_y_pred = classifier.predict(income_X_test)
cm = confusion_matrix(income_y_test, income_y_pred)
income_results = accuracy(cm)

# Fit the training data to the random forest, make prediction and check accuracy for non-income data
classifier.fit(non_inc_X_train, non_inc_y_train)
non_inc_y_pred = classifier.predict(non_inc_X_test)
cm = confusion_matrix(non_inc_y_test, non_inc_y_pred)
non_inc_results = accuracy(cm)

# Fit the training data to the random forest, make prediction and check accuracy for all data
classifier.fit(all_X_train, all_y_train)
all_y_pred = classifier.predict(all_X_test)
cm = confusion_matrix(all_y_test, all_y_pred)
all_results = accuracy(cm)

print('Accuracy of random forest on Income : ', income_results)
print('Accuracy of random forest on Non-Income : ', non_inc_results)
print('Accuracy of random forest overall : ', all_results)

Accuracy of random forest on Income :  0.46368946327553695
Accuracy of random forest on Non-Income :  0.4345306535436692
Accuracy of random forest overall :  0.44002667525180517
