<a href="https://colab.research.google.com/github/dustinhodges/DS-Unit-2-Applied-Modeling/blob/master/Chile.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [45]:
import pandas as pd

# Raw survey data, read straight from the course repository.
CHILE_CSV_URL = 'https://raw.githubusercontent.com/dustinhodges/DS-Unit-2-Applied-Modeling/master/Chile.csv'

chile = pd.read_csv(CHILE_CSV_URL)
print(chile.shape)

chile.head()

(2700, 9)


Unnamed: 0.1,Unnamed: 0,region,population,sex,age,education,income,statusquo,vote
0,1,N,175000,M,65.0,P,35000.0,1.0082,Y
1,2,N,175000,M,29.0,PS,7500.0,-1.29617,N
2,3,N,175000,F,38.0,P,15000.0,1.23072,Y
3,4,N,175000,F,49.0,P,35000.0,-1.03163,N
4,5,N,175000,F,23.0,S,35000.0,-1.10496,N


In [46]:
# Distribution of the raw vote codes before any recoding.
chile.vote.value_counts()

N    889
Y    868
U    588
A    187
Name: vote, dtype: int64

In [47]:
# Collapse the four vote codes into a binary target: "N" becomes
# "contra-Pinochet" and every other recorded answer (Y, U, A) becomes "YUA".
vote_labels = {"N": "contra-Pinochet"}
vote_labels.update(dict.fromkeys(("Y", "U", "A"), "YUA"))

# Work on a copy so the raw `chile` frame stays untouched.
pino = chile.copy()
pino['vote'] = pino['vote'].replace(vote_labels)

pino['vote'].value_counts()

YUA                1643
contra-Pinochet     889
Name: vote, dtype: int64

In [48]:
# Majority-class baseline for the binary target `vote`:
# always predicting "YUA" is right about 65% of the time, while
# "contra-Pinochet" makes up the remaining ~35%.
# Neither class dominates, so plain accuracy is a usable metric.

# Derive the counts from the data instead of copying them from an earlier
# cell's output, so the baseline stays correct if the data or recoding changes.
# value_counts() skips missing votes by default.
vote_counts = pino['vote'].value_counts()
total = int(vote_counts.sum())   # respondents with a recorded vote
yua = int(vote_counts['YUA'])    # size of the majority class
yua / total

0.6488941548183255

In [49]:
# Denominator used for the baseline above: number of rows with a recorded vote.
total

2532

In [0]:
# Keep only respondents whose vote is recorded; the target cannot be missing.
has_vote = pino['vote'].notna()
pino = pino.loc[has_vote]

In [51]:
import numpy as np
from sklearn.model_selection import train_test_split

# Two successive stratified 80/20 splits: first carve off the test set,
# then split what remains into train and validation.
split_options = dict(train_size=0.80, test_size=0.20, random_state=42)

labelled = pino.copy()
train_val, test = train_test_split(
    labelled, stratify=labelled['vote'], **split_options
)
train, val = train_test_split(
    train_val, stratify=train_val['vote'], **split_options
)

train.shape, val.shape, test.shape

((1620, 9), (405, 9), (507, 9))

In [52]:
def wrangle(X, drop_columns=('Unnamed: 0', 'statusquo')):
    """Return a copy of ``X`` without the columns excluded from modeling.

    Parameters
    ----------
    X : pandas.DataFrame
        Frame to clean. It is never modified in place.
    drop_columns : iterable of str, optional
        Column names to remove. Defaults to the two columns this notebook
        has always discarded ('Unnamed: 0' and 'statusquo').

    Returns
    -------
    pandas.DataFrame
        A new frame with the requested columns removed.

    Notes
    -----
    Columns that are already absent are skipped rather than raising
    ``KeyError``. This makes the function idempotent, so a cell of the form
    ``train = wrangle(train)`` can be re-run safely.
    """
    # DataFrame.drop returns a new object, so the caller's frame is untouched.
    return X.drop(columns=list(drop_columns), errors='ignore')

# Apply the same cleaning step to each of the three splits.
train, val, test = (wrangle(split) for split in (train, val, test))

train.shape, val.shape, test.shape

((1620, 7), (405, 7), (507, 7))

In [53]:
# Income takes only a handful of distinct values in the training split.
train.income.value_counts()

15000.0     463
35000.0     461
7500.0      296
75000.0     156
2500.0       94
125000.0     55
200000.0     46
Name: income, dtype: int64

In [54]:
!pip install category_encoders==2.*
from sklearn.linear_model import LogisticRegression
import category_encoders as ce 
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.preprocessing import StandardScaler



In [0]:
# Predictors shared by every model below; `vote` is the label.
target = 'vote'
features = ['age', 'population', 'sex', 'education', 'income', 'region']

# Separate the feature matrix and label vector for each split.
X_train, y_train = train[features], train[target]
X_val, y_val = val[features], val[target]
X_test, y_test = test[features], test[target]

In [55]:
# Three preprocessing stages for the linear model:
# one-hot encoding -> imputation -> standardization.
encoder = ce.OneHotEncoder(use_cat_names=True)
imputer = SimpleImputer()
scaler = StandardScaler()

# Training split: each transformer is fitted here, and only here.
X_train_encoded = encoder.fit_transform(X_train)
X_train_imputed = imputer.fit_transform(X_train_encoded)
X_train_scaled = scaler.fit_transform(X_train_imputed)

# Validation split: reuse the fitted transformers.
X_val_encoded = encoder.transform(X_val)
X_val_imputed = imputer.transform(X_val_encoded)
X_val_scaled = scaler.transform(X_val_imputed)

# Test split: transformed now, scored in a later cell.
X_test_encoded = encoder.transform(X_test)
X_test_imputed = imputer.transform(X_test_encoded)
X_test_scaled = scaler.transform(X_test_imputed)

# fit() returns the estimator itself, so it can be chained.
model = LogisticRegression().fit(X_train_scaled, y_train)

train_accuracy = model.score(X_train_scaled, y_train)
val_accuracy = model.score(X_val_scaled, y_val)
print('Train Accuracy', train_accuracy)
print('Validation Accuracy', val_accuracy)

Train Accuracy 0.667283950617284
Validation Accuracy 0.691358024691358






In [66]:
# Peek at the one-hot encoded training features.
X_train_encoded.head(5)

Unnamed: 0,age,population,sex_M,sex_F,education_S,education_P,education_PS,education_nan,income,region_S,region_SA,region_C,region_N,region_M
1222,41.0,45000,1,0,1,0,0,0,35000.0,1,0,0,0,0
2586,70.0,250000,1,0,1,0,0,0,35000.0,0,1,0,0,0
2390,36.0,250000,1,0,1,0,0,0,15000.0,0,1,0,0,0
893,50.0,25000,1,0,0,1,0,0,7500.0,0,0,1,0,0
2180,36.0,250000,0,1,0,1,0,0,35000.0,0,1,0,0,0


In [56]:
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import make_pipeline

# Random forest on ordinal-encoded, median-imputed features.
forest_steps = [
    ce.OrdinalEncoder(),
    SimpleImputer(strategy='median'),
    RandomForestClassifier(n_estimators=20, max_depth=4, random_state=42, n_jobs=-1),
]
forest = make_pipeline(*forest_steps)
forest.fit(X_train, y_train)

forest_train_accuracy = forest.score(X_train, y_train)
forest_val_accuracy = forest.score(X_val, y_val)
print('Train Accuracy', forest_train_accuracy)
print('Validation Accuracy', forest_val_accuracy)

Train Accuracy 0.678395061728395
Validation Accuracy 0.6691358024691358


In [57]:
from sklearn.pipeline import make_pipeline
from sklearn.tree import DecisionTreeClassifier

# Single decision tree with the same encoding and imputation as the forest.
tree_steps = [
    ce.OrdinalEncoder(),
    SimpleImputer(strategy='median'),
    DecisionTreeClassifier(max_depth=8, min_samples_split=8, min_samples_leaf=5, random_state=21),
]
tree = make_pipeline(*tree_steps)
tree.fit(X_train, y_train)

tree_train_accuracy = tree.score(X_train, y_train)
tree_val_accuracy = tree.score(X_val, y_val)
print('Train Accuracy', tree_train_accuracy)
print('Validation Accuracy', tree_val_accuracy)

Train Accuracy 0.7358024691358025
Validation Accuracy 0.6592592592592592


In [58]:
# Held-out test accuracy for the three fitted models, in a fixed order.
# The linear model needs the pre-scaled test matrix; the pipelines take raw features.
test_scores = (
    ('Tree Test Accuracy', tree.score(X_test, y_test)),
    ('Forest Test Accuracy', forest.score(X_test, y_test)),
    ('Linear Model Test Accuracy', model.score(X_test_scaled, y_test)),
)
for label, accuracy in test_scores:
    print(label, accuracy)

Tree Test Accuracy 0.6252465483234714
Forest Test Accuracy 0.6587771203155819
Linear Model Test Accuracy 0.6706114398422091


In [59]:
# First rows of the cleaned training split.
train.head(n=5)

Unnamed: 0,region,population,sex,age,education,income,vote
1222,S,45000,M,41.0,S,35000.0,contra-Pinochet
2586,SA,250000,M,70.0,S,35000.0,contra-Pinochet
2390,SA,250000,M,36.0,S,15000.0,YUA
893,C,25000,M,50.0,P,7500.0,YUA
2180,SA,250000,F,36.0,P,35000.0,contra-Pinochet


In [60]:
import plotly.express as px
df = train  # alias; the histogram cell that follows reads `df` as well

# Age against population, coloured by vote, with marginal plots and an OLS trendline.
scatter_options = dict(
    x="population",
    y="age",
    color="vote",
    marginal_y="histogram",
    marginal_x="box",
    trendline="ols",
)
fig = px.scatter(df, **scatter_options)
fig.show()


Method .ptp is deprecated and will be removed in a future version. Use numpy.ptp instead.



In [61]:
# Count of respondents per education level, split by vote.
fig = px.histogram(data_frame=df, x="education", color="vote")
fig.show()