In [1]:
import numpy as np
import pandas as pd
import statsmodels.api as sm

#Modelos de regresion y clasificacion
from sklearn.linear_model import LinearRegression
from sklearn.linear_model import LogisticRegression
from sklearn.neighbors import KNeighborsRegressor
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeRegressor
from sklearn.tree import DecisionTreeClassifier

from sklearn.preprocessing import MinMaxScaler

from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
from sklearn import metrics

import faraway.datasets.divusa as divusa


In [2]:
data = divusa.load()
data.head()

Unnamed: 0,year,divorce,unemployed,femlab,marriage,birth,military
0,1920,8.0,5.2,22.7,92.0,117.9,3.2247
1,1921,7.2,11.7,22.79,83.0,119.8,3.5614
2,1922,6.6,6.7,22.88,79.7,111.2,2.4553
3,1923,7.1,2.4,22.97,85.2,110.5,2.2065
4,1924,7.2,5.0,23.06,80.3,110.9,2.2889


In [3]:
data.drop("year", axis=1, inplace=True)
data.head()

Unnamed: 0,divorce,unemployed,femlab,marriage,birth,military
0,8.0,5.2,22.7,92.0,117.9,3.2247
1,7.2,11.7,22.79,83.0,119.8,3.5614
2,6.6,6.7,22.88,79.7,111.2,2.4553
3,7.1,2.4,22.97,85.2,110.5,2.2065
4,7.2,5.0,23.06,80.3,110.9,2.2889


In [4]:
train, test = train_test_split(data, test_size=0.2, random_state=42)

In [5]:
train.head()

Unnamed: 0,divorce,unemployed,femlab,marriage,birth,military
9,8.0,3.2,23.51,75.5,89.3,2.0944
5,7.2,3.2,23.15,79.2,106.6,2.1735
34,9.5,5.5,34.6,79.8,118.1,20.3343
22,10.1,4.7,30.9,93.0,91.5,28.6133
30,10.3,5.3,33.9,90.2,106.2,9.627


In [6]:
mse_train = np.zeros(3)
mse_test = np.zeros(3)
X = train.iloc[:, 1:]
y = train.iloc[:, 0]

In [7]:
X

Unnamed: 0,unemployed,femlab,marriage,birth,military
9,3.2,23.51,75.5,89.3,2.0944
5,3.2,23.15,79.2,106.6,2.1735
34,5.5,34.60,79.8,118.1,20.3343
22,4.7,30.90,93.0,91.5,28.6133
30,5.3,33.90,90.2,106.2,9.6270
...,...,...,...,...,...
20,14.6,27.90,82.8,79.9,3.4693
60,7.1,51.50,61.4,68.4,9.0247
71,6.8,57.40,54.2,69.6,7.8744
14,21.7,25.32,71.8,78.5,1.9539


In [8]:
y

9      8.0
5      7.2
34     9.5
22    10.1
30    10.3
      ... 
20     8.8
60    22.6
71    20.9
14     7.5
51    15.8
Name: divorce, Length: 61, dtype: float64

# Linear Regression

In [9]:
lm_model = LinearRegression(fit_intercept=True)
lm_model.fit(X, y)

In [10]:
y_pred_train = lm_model.predict(X)
y_pred_test = lm_model.predict(test.iloc[:, 1:])

In [11]:
mse_train[0] = metrics.mean_squared_error(y, y_pred_train)
mse_test[0] = metrics.mean_squared_error(test.iloc[:,0], y_pred_test)

In [12]:
print(mse_train)
print(mse_test)

[2.53247962 0.         0.        ]
[2.59041398 0.         0.        ]


# KNN

In [13]:
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(test.iloc[:,1:])

In [14]:
knn_model = KNeighborsRegressor(n_neighbors=10, metric='euclidean')
knn_model.fit(X_scaled, y)

In [15]:
y_pred_train = knn_model.predict(X_scaled)
y_pred_test = knn_model.predict(X_test_scaled)

In [16]:
mse_train[1] = metrics.mean_squared_error(y, y_pred_train)
mse_test[1] = metrics.mean_squared_error(test.iloc[:, 0], y_pred_test)

In [17]:
print(mse_train)
print(mse_test)

[2.53247962 1.67057541 0.        ]
[2.59041398 0.56124375 0.        ]


# DT

In [18]:
dt_model = DecisionTreeRegressor(max_depth=4, min_samples_split=20, random_state=123)
dt_model.fit(X, y)

In [19]:
y_pred_train = dt_model.predict(X)
y_pred_test = dt_model.predict(test.iloc[:,1:])

In [20]:
mse_train[2] = metrics.mean_squared_error(y, y_pred_train)
mse_test[2] = metrics.mean_squared_error(test.iloc[:,0], y_pred_test)

In [21]:
print(mse_train)
print(mse_test)

[2.53247962 1.67057541 0.91575339]
[2.59041398 0.56124375 0.87215844]


# Cross Validation

In [22]:
mse_cv_mean = np.zeros(3)
mse_cv_std = np.zeros(3)

In [23]:
model = LinearRegression(fit_intercept=True)
scores =cross_val_score(model, X, y, cv=10, scoring=metrics.make_scorer(metrics.mean_squared_error))
mse_cv_mean[0] = scores.mean()
mse_cv_std[0] = scores.std()

In [24]:
print(scores)

[1.48910859 1.29539762 4.22665016 2.46952385 1.33811311 4.53966091
 7.43667788 2.27687076 4.18405806 2.67282593]


In [25]:
print(mse_cv_mean)
print(mse_cv_std)

[3.19288869 0.         0.        ]
[1.82896047 0.         0.        ]


# KNN

In [26]:
scaler = MinMaxScaler()
X_scaled = scaler.fit_transform(X)
X_test_scaled = scaler.transform(test.iloc[:, 1:])

In [27]:
k_cv = None
k_cv = np.zeros(15)
for k in range(1, 16):
    model = KNeighborsRegressor(n_neighbors=k, metric='euclidean')
    scores = cross_val_score(model, X_scaled, y, scoring= metrics.make_scorer(metrics.mean_squared_error), cv=10)
    k_cv[k-1] = scores.mean()
K = k_cv.argmin(0)+1

In [28]:
print(k_cv)

[1.32919048 1.30166071 1.36928307 1.43854167 1.55781619 1.61025595
 1.83155539 2.00285231 2.12997354 2.196555   2.50577706 2.7096045
 3.09450535 3.29648469 3.5331946 ]


In [29]:
print(K)

2


In [30]:
model = KNeighborsRegressor(n_neighbors=K, metric='euclidean')
scores = cross_val_score(model, X_scaled, y, scoring=metrics.make_scorer(metrics.mean_absolute_error), cv=10)

In [31]:
mse_cv_mean[1] = scores.mean()
mse_cv_std[1] = scores.std()

In [32]:
print(mse_cv_mean)
print(mse_cv_std)

[3.19288869 0.63821429 0.        ]
[1.82896047 0.37273939 0.        ]


DT

In [33]:
model = DecisionTreeRegressor(max_depth=4, min_samples_split=20, random_state=123)
scores = cross_val_score(model, X, y, scoring=metrics.make_scorer(metrics.mean_squared_error), cv=10)

In [34]:
mse_cv_mean[2] = scores.mean()
mse_cv_std[2] = scores.std()

In [35]:
print(mse_cv_mean)
print(mse_cv_std)

[3.19288869 0.63821429 4.22615998]
[1.82896047 0.37273939 2.44496131]


# DT importance

In [36]:
model = DecisionTreeRegressor(max_depth=4, min_samples_split=20, random_state=123)
model.fit(X,y)

In [37]:
model.feature_importances_

array([0.        , 0.88676646, 0.04836582, 0.01268331, 0.05218442])

In [38]:
X.columns

Index(['unemployed', 'femlab', 'marriage', 'birth', 'military'], dtype='object')

In [39]:
pd.DataFrame({'feature': X.columns, 'importance': model.feature_importances_}).sort_values(by='importance', ascending=False)

Unnamed: 0,feature,importance
1,femlab,0.886766
4,military,0.052184
2,marriage,0.048366
3,birth,0.012683
0,unemployed,0.0
