<a href="https://colab.research.google.com/github/dustinhodges/DS-Unit-2-Linear-Models/blob/master/Chile.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Load Dataset, some cleaning...



In [1]:
import pandas as pd

# Location of the Chile CSV in the project's GitHub repository.
DATA_URL = 'https://raw.githubusercontent.com/dustinhodges/DS-Unit-2-Applied-Modeling/master/Chile.csv'

chile = pd.read_csv(DATA_URL)
print(chile.shape)

# Quick look at the first rows to check the columns loaded as expected.
chile.head()

(2700, 9)


Unnamed: 0.1,Unnamed: 0,region,population,sex,age,education,income,statusquo,vote
0,1,N,175000,M,65.0,P,35000.0,1.0082,Y
1,2,N,175000,M,29.0,PS,7500.0,-1.29617,N
2,3,N,175000,F,38.0,P,15000.0,1.23072,Y
3,4,N,175000,F,49.0,P,35000.0,-1.03163,N
4,5,N,175000,F,23.0,S,35000.0,-1.10496,N


In [2]:
# Class counts of the raw target column. value_counts() skips missing
# values by default, so rows without a vote are not counted here.
chile['vote'].value_counts()

N    889
Y    868
U    588
A    187
Name: vote, dtype: int64

In [0]:
# Education codes are relabelled with a numeric prefix so they sort from
# lowest to highest level.
education_labels = {
    "P": "1-primary",
    "S": "2-secondary",
    "PS": "3-post-secondary",
}

# Rows without a recorded vote cannot be used for a supervised target.
chile = (
    chile
    .dropna(subset=['vote'])
    .assign(education=lambda frame: frame['education'].replace(education_labels))
)

In [4]:
# Size of the frame after the rows with a missing vote were dropped.
chile.shape

(2532, 9)

# Explore the relationships between the features and the target ("vote")


In [5]:
import plotly.express as px
# Distribution of population, split by vote.
df = chile
fig = px.box(data_frame=df, y="population", color="vote")
fig.show()

In [6]:
# 'sex' is categorical, so a box plot (which summarises quartiles of a numeric
# variable) carries no information here. A grouped histogram shows the vote
# counts per category instead.
fig = px.histogram(df, x="sex", color="vote", barmode="group")
fig.show()

In [7]:
# Age distribution for each vote class.
fig = px.box(data_frame=df, y="age", color="vote")
fig.show()

In [8]:
# Income distribution for each vote class.
fig = px.box(data_frame=df, y="income", color="vote")
fig.show()

In [9]:
# Vote counts per region (bars coloured by vote).
fig = px.histogram(data_frame=df, x="region", color="vote")
fig.show()

In [10]:
# 'education' holds category labels, so quartile-based box plots are not
# meaningful for it. Show the vote counts per education level instead.
fig = px.histogram(df, x="education", color="vote", barmode="group")
fig.show()

# Streamline the target, get a baseline for accuracy, Train/Val/Test split, wrangle, and install category encoders.

In [11]:
# Collapse the four raw vote codes into a binary target:
# "N" on one side, everything else ("Y", "U", "A") on the other.
vote_labels = {
    "N": "contra-Pinochet",
    **dict.fromkeys(["Y", "U", "A"], "pro/undecided/abstain"),
}

# assign() returns a new frame, so `chile` itself is left untouched.
pino = chile.assign(vote=chile['vote'].replace(vote_labels))

pino['vote'].value_counts()

pro/undecided/abstain    1643
contra-Pinochet           889
Name: vote, dtype: int64

In [12]:
# Majority-class baseline: always predicting "pro/undecided/abstain" is
# correct about 65% of the time.
# 'vote' is the target and the problem is a binary classification,
# so accuracy is a usable metric.
#
# The counts are derived from the data instead of being typed in by hand,
# so the baseline stays correct if the cleaning steps change.
total = len(pino)
pro = int((pino['vote'] == 'pro/undecided/abstain').sum())
pro / total

0.6488941548183255

In [13]:
# Total number of rows behind the baseline (both vote classes together).
total

2532

In [14]:
import numpy as np
from sklearn.model_selection import train_test_split

# Two stratified 80/20 splits: first hold out the test set, then carve the
# validation set out of what remains. Stratifying on 'vote' keeps the class
# balance the same in every split.
train_val, test = train_test_split(
    pino.copy(),
    train_size=0.80,
    test_size=0.20,
    stratify=pino['vote'],
    random_state=42,
)
train, val = train_test_split(
    train_val,
    train_size=0.80,
    test_size=0.20,
    stratify=train_val['vote'],
    random_state=42,
)

train.shape, val.shape, test.shape

((1620, 9), (405, 9), (507, 9))

In [15]:
#'statusquo' would cause data leakage

def wrangle(X):
    """Return a copy of ``X`` without the columns that must not be modelled.

    Columns removed (when present):
      * 'Unnamed: 0' -- looks like a row counter carried over from the CSV
        (TODO confirm); it is not a real feature.
      * 'statusquo'  -- flagged in this notebook as a source of data leakage.

    The input frame is never modified. Columns that are already absent are
    skipped instead of raising ``KeyError``, so re-running the cell that
    reassigns ``train = wrangle(train)`` is safe.
    """
    unusable_variance = ['Unnamed: 0', 'statusquo']

    # drop() returns a new frame, so the explicit copy only guards against
    # later in-place edits leaking back to the caller's frame.
    return X.copy().drop(columns=unusable_variance, errors='ignore')

# Apply the same column clean-up to every split.
train, val, test = (wrangle(split) for split in (train, val, test))

train.shape, val.shape, test.shape

((1620, 7), (405, 7), (507, 7))

In [16]:
!pip install category_encoders==2.*
from sklearn.linear_model import LogisticRegression
import category_encoders as ce 
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import StandardScaler



# Logistic Regression, Tree, Forest models

In [0]:
# Predictor columns used by every model below, and the label column.
target = 'vote'
features = ['age', 'education', 'population', 'region', 'income', 'sex']

# Feature matrix / label vector for each split.
X_train, y_train = train[features], train[target]
X_val, y_val = val[features], val[target]
X_test, y_test = test[features], test[target]

In [18]:
# Every preprocessing step is fitted on the training split only and then
# applied unchanged to the validation and test splits.

# 1) One-hot encode the categorical columns.
encoder = ce.OneHotEncoder(use_cat_names=True)
X_train_encoded = encoder.fit_transform(X_train)
X_val_encoded, X_test_encoded = (
    encoder.transform(part) for part in (X_val, X_test)
)

# 2) Fill missing values (SimpleImputer's default strategy).
imputer = SimpleImputer()
X_train_imputed = imputer.fit_transform(X_train_encoded)
X_val_imputed, X_test_imputed = (
    imputer.transform(part) for part in (X_val_encoded, X_test_encoded)
)

# 3) Standardize the features before the linear model.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train_imputed)
X_val_scaled, X_test_scaled = (
    scaler.transform(part) for part in (X_val_imputed, X_test_imputed)
)

# 4) Fit the classifier; fit() returns the estimator itself.
model = LogisticRegression().fit(X_train_scaled, y_train)

train_accuracy = model.score(X_train_scaled, y_train)
val_accuracy = model.score(X_val_scaled, y_val)
print('Train Accuracy', train_accuracy)
print('Validation Accuracy', val_accuracy)


Train Accuracy 0.6691358024691358
Validation Accuracy 0.6790123456790124






In [81]:
# Weights learned by the logistic regression, one per column of the
# standardized one-hot training matrix it was fitted on.
model.coef_

array([[ 0.2318057 , -0.01780145,  0.14513671, -0.16832188,  0.02204299,
        -0.30784115,  0.06310619,  0.00726741, -0.08995113, -0.0010264 ,
         0.03303832,  0.07210566,  0.15196968, -0.15196968]])

In [89]:
# First rows of the one-hot encoded training frame. Its column order should
# line up with the order of the weights in model.coef_ -- TODO confirm.
X_train_encoded.head()

Unnamed: 0,age,education_2-secondary,education_1-primary,education_3-post-secondary,education_nan,population,region_SA,region_N,region_C,region_S,region_M,income,sex_F,sex_M
1930,45.0,1,0,0,0,250000,1,0,0,0,0,15000.0,1,0
1834,44.0,0,1,0,0,250000,1,0,0,0,0,7500.0,1,0
94,21.0,0,0,1,0,125000,0,1,0,0,0,15000.0,0,1
766,49.0,0,1,0,0,175000,0,0,1,0,0,15000.0,0,1
2500,39.0,0,0,1,0,250000,1,0,0,0,0,200000.0,0,1


In [68]:
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier

# Shallow decision tree with minimum split/leaf sizes set on the classifier.
tree_classifier = DecisionTreeClassifier(
    max_depth=6,
    min_samples_split=10,
    min_samples_leaf=8,
    random_state=21,
)

# Ordinal-encode the categorical columns, fill gaps with the column median,
# then fit the tree.
tree = make_pipeline(
    ce.OrdinalEncoder(),
    SimpleImputer(strategy='median'),
    tree_classifier,
)
tree.fit(X_train, y_train)

tree_train_accuracy = tree.score(X_train, y_train)
tree_val_accuracy = tree.score(X_val, y_val)
print('Train Accuracy', tree_train_accuracy)
print('Validation Accuracy', tree_val_accuracy)

Train Accuracy 0.6981481481481482
Validation Accuracy 0.6790123456790124


In [78]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline

# Random forest on one-hot encoded, median-imputed features.
#
# NOTE(review): the saved cell read `max_depth=,`, which is a syntax error, so
# the depth used for the recorded output is unknown. 6 is used here to match
# the decision tree in the previous model; the recorded accuracies may not
# reproduce exactly with this value.
forest = make_pipeline(
    ce.OneHotEncoder(),
    SimpleImputer(strategy='median'),
    RandomForestClassifier(n_estimators=100, max_depth=6, random_state=42, n_jobs=-1)
)

forest.fit(X_train, y_train)

print('Train Accuracy', forest.score(X_train, y_train))
print('Validation Accuracy', forest.score(X_val, y_val))

Train Accuracy 0.7444444444444445
Validation Accuracy 0.671604938271605


# Visualize the features to understand why the linear model reached 68% validation accuracy, matching the tree and slightly ahead of the forest.

In [36]:
import plotly.express as px
# From here on the plots use the training split only.
df = train

# Age against population, coloured by vote, with marginal distributions and
# an OLS trend line.
scatter_options = dict(
    x="population",
    y="age",
    color="vote",
    marginal_y="histogram",
    marginal_x="box",
    trendline='ols',
    width=1000,
)
fig = px.scatter(df, **scatter_options)
fig.show()


Method .ptp is deprecated and will be removed in a future version. Use numpy.ptp instead.



In [35]:
# Age against income, coloured by vote, with marginal distributions.
fig = px.scatter(
    data_frame=df,
    x="income",
    y="age",
    color="vote",
    marginal_y="histogram",
    marginal_x="box",
    width=1000,
)
fig.show()

In [39]:
# Vote counts per education level, side by side.
fig = px.histogram(data_frame=df, x="education", color="vote", barmode="group", width=1000)
# Order the categories from most to fewest observations.
fig.update_xaxes(categoryorder="total descending")
fig.show()

In [38]:
# Observation: women support Pinochet in greater numbers.
# Caveat: the non-"contra" class also pools undecided and abstaining voters.
fig = px.histogram(data_frame=df, x="sex", color="vote", barmode='group', width=1000)
fig.update_xaxes(categoryorder="total descending")
fig.show()

In [26]:
# Vote counts per region, overlaid with transparency and ordered by total count.
fig = px.histogram(data_frame=df, x="region", color="vote", barmode="overlay", opacity=0.6)
fig.update_xaxes(categoryorder="total descending")
fig.show()

In [43]:
# Vote counts by population value; every column is passed as hover data.
fig = px.histogram(
    data_frame=df,
    x="population",
    color="vote",
    barmode='group',
    width=900,
    hover_data=df.columns,
)
fig.show()

In [44]:
# Observation: the young vote against Pinochet in greater proportion than other groups.
fig = px.histogram(
    data_frame=df,
    x="age",
    color="vote",
    barmode="overlay",
    opacity=0.6,
    width=1000,
)
fig.show()

In [46]:
# Observation: the lowest incomes show the highest support for Pinochet.
fig = px.histogram(data_frame=df, x="income", color="vote", barmode="group")
fig.show()

In [30]:
# Age distribution per vote class.
fig = px.box(data_frame=df, y="age", color="vote")
fig.show()

# Test Accuracies

In [79]:
# Final check on the held-out test split. The linear model is scored on the
# standardized test matrix; the two pipelines take the raw feature frame.
tree_test_accuracy = tree.score(X_test, y_test)
forest_test_accuracy = forest.score(X_test, y_test)
linear_test_accuracy = model.score(X_test_scaled, y_test)

print('Tree Test Accuracy', tree_test_accuracy)
print('Forest Test Accuracy', forest_test_accuracy)
print('Linear Model Test Accuracy', linear_test_accuracy)

Tree Test Accuracy 0.6469428007889546
Forest Test Accuracy 0.6568047337278107
Linear Model Test Accuracy 0.6627218934911243
