In [1]:
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
import copy
import joblib

In [2]:
from py.utils import verifyDir

In [3]:
DATA_PATH = "data/PlacePulse_2/"

QSCORE_PATH = f"{DATA_PATH}/Qscores/all/Summaries/"
MAIN_OUT_PATH = "outputs/"
CACHE_PATH = f"{MAIN_OUT_PATH}/cache/"

In [4]:
verifyDir(CACHE_PATH)

### Generating samples 

In [5]:
data_dict = joblib.load(f"{MAIN_OUT_PATH}/static/data_model.joblib")
data_dict.keys()

dict_keys(['id', 'features', 'features_name', 'safety', 'label', 'latitude', 'longitude', 'city', 'country', 'continent', 'path'])

In [6]:
y = np.array(data_dict["safety"]).copy()
X = np.array(data_dict["features"]).copy()
X.shape, y.shape

((110988, 19), (110988,))

### Standard Scaler

In [7]:
from sklearn.preprocessing import StandardScaler

In [8]:
scaler = StandardScaler()

In [9]:
from sklearn.preprocessing import Normalizer

In [10]:
normalizer = Normalizer()

### Classifiers

In [11]:
from sklearn.svm import SVR, LinearSVR
from sklearn.linear_model import Ridge, LinearRegression, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

In [12]:
regularizer_value = np.logspace(-3, 3, num=7)
type_scaler = ['passthrough', scaler, normalizer]
# dim_reductor = [t_svd, pca, sparse_pca]
random_state = 42
max_iter=2000

In [13]:
lr = LinearRegression()

param_lr = {}
param_lr['scaler'] = copy.deepcopy(type_scaler)
#param_lr['dim_reductor'] = copy.deepcopy(dim_reductor)
param_lr['regressor'] = [lr]

In [14]:
lasso = Lasso(
    tol=1e-3,
    random_state=random_state,
    max_iter=max_iter,
)

param_lasso = {}
param_lasso['scaler'] = copy.deepcopy(type_scaler)
# param_ridge['dim_reductor'] = copy.deepcopy(dim_reductor)
param_lasso['regressor__alpha'] = copy.deepcopy(regularizer_value)
param_lasso['regressor'] = [lasso]

In [15]:
ridge = Ridge(
    tol=1e-3,
    random_state=random_state,
    max_iter=max_iter,
)

param_ridge = {}
param_ridge['scaler'] = copy.deepcopy(type_scaler)
# param_ridge['dim_reductor'] = copy.deepcopy(dim_reductor)
param_ridge['regressor__alpha'] = copy.deepcopy(regularizer_value)
param_ridge['regressor'] = [ridge]

In [16]:
linear_svm = LinearSVR(
    tol=1e-3,
    random_state=random_state,
    max_iter=max_iter,
)

param_linear_svm = {}
param_linear_svm['scaler'] = copy.deepcopy(type_scaler)
# param_linear_svm['dim_reductor'] = copy.deepcopy(dim_reductor)
param_linear_svm['regressor__C'] = copy.deepcopy(regularizer_value)
param_linear_svm['regressor__loss'] = ["epsilon_insensitive", "squared_epsilon_insensitive"]
param_linear_svm['regressor'] = [linear_svm]

In [17]:
svm = SVR(
    tol=1e-3,
    max_iter=max_iter,
)

param_svm = {}
param_svm['scaler'] = copy.deepcopy(type_scaler)
# param_linear_svm['dim_reductor'] = copy.deepcopy(dim_reductor)
param_svm['regressor__C'] = copy.deepcopy(regularizer_value)
param_svm['regressor__gamma'] = ["scale", "auto"]
param_svm['regressor__kernel'] = ["linear", "poly", "rbf"]
param_svm['regressor'] = [svm]

In [18]:
ds_tree = DecisionTreeRegressor(random_state=random_state)

param_tree = {}
param_tree["scaler"] = copy.deepcopy(type_scaler)
param_tree['regressor__max_features'] = [None, 'sqrt']
param_tree['regressor__max_depth'] = np.append(None, np.arange(10, 110, 10) )
param_tree['regressor__min_samples_split'] = np.arange(3, 7)
param_tree['regressor__min_samples_leaf'] = np.arange(3, 7)
param_tree['regressor__criterion'] = ["squared_error", "friedman_mse", "absolute_error", "poisson"]
param_tree['regressor'] = [ds_tree]

### Create Pipeline

In [19]:
from joblib import Memory

In [20]:
memory = Memory(
    location=CACHE_PATH,
    # verbose=1,
)

In [21]:
from sklearn.pipeline import Pipeline

In [22]:
pipeline = Pipeline(
    steps = [
        # 1. Scaler
        ('scaler', scaler),
        # 3. Model selecction
        ('regressor', lr),
    ],
    memory=memory,
)

### Metrics

In [23]:
from sklearn.metrics import mean_absolute_error, mean_squared_error, r2_score, make_scorer
from scipy.stats import pearsonr

In [24]:
def adj_r2_score(estimator, X, y_true):
    n, p = X.shape
    pred = estimator.predict(X)
    return 1 - ((1 - r2_score(y_true, pred)) * (n - 1))/(n-p-1)

In [25]:
def pearson_score(y_true, y_pred):
    corr, _ = pearsonr(y_true, y_pred)
    return corr

### Create GridSearch

In [26]:
from sklearn.model_selection import GridSearchCV

In [27]:
param_grid = [param_lr, 
              param_lasso,
              param_ridge,
              param_linear_svm,
              param_svm,
              #param_tree,
             ]

In [28]:
param_grid

[{'scaler': ['passthrough', StandardScaler(), Normalizer()],
  'regressor': [LinearRegression()]},
 {'scaler': ['passthrough', StandardScaler(), Normalizer()],
  'regressor__alpha': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]),
  'regressor': [Lasso(max_iter=2000, random_state=42, tol=0.001)]},
 {'scaler': ['passthrough', StandardScaler(), Normalizer()],
  'regressor__alpha': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]),
  'regressor': [Ridge(max_iter=2000, random_state=42, tol=0.001)]},
 {'scaler': ['passthrough', StandardScaler(), Normalizer()],
  'regressor__C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]),
  'regressor__loss': ['epsilon_insensitive', 'squared_epsilon_insensitive'],
  'regressor': [LinearSVR(max_iter=2000, random_state=42, tol=0.001)]},
 {'scaler': ['passthrough', StandardScaler(), Normalizer()],
  'regressor__C': array([1.e-03, 1.e-02, 1.e-01, 1.e+00, 1.e+01, 1.e+02, 1.e+03]),
  'regressor__gamma': ['scale', 

In [29]:
grid_search = GridSearchCV(
            estimator=pipeline,
            param_grid=param_grid,
            scoring=make_scorer(pearson_score),
            refit=True,
            cv=5,  # << Use time series
            verbose=4,
)

In [30]:
grid_search

### Data Split

In [31]:
from sklearn.metrics import classification_report
from sklearn.model_selection import train_test_split

In [32]:
xtrain, xtest, ytrain, ytest = train_test_split(X, 
                                                y, 
                                                train_size=0.75, 
                                                random_state=random_state,
                                               )
print(xtrain.shape, ytrain.shape, xtest.shape, ytest.shape)

(83241, 19) (83241,) (27747, 19) (27747,)


### Training

In [None]:
%%time
regressor = grid_search.fit(xtrain, ytrain)

Fitting 5 folds for each of 213 candidates, totalling 1065 fits
[CV 1/5] END regressor=LinearRegression(), scaler=passthrough;, score=0.279 total time=   0.0s
[CV 2/5] END regressor=LinearRegression(), scaler=passthrough;, score=0.278 total time=   0.0s
[CV 3/5] END regressor=LinearRegression(), scaler=passthrough;, score=0.292 total time=   0.0s
[CV 4/5] END regressor=LinearRegression(), scaler=passthrough;, score=0.283 total time=   0.0s
[CV 5/5] END regressor=LinearRegression(), scaler=passthrough;, score=0.283 total time=   0.0s
[CV 1/5] END regressor=LinearRegression(), scaler=StandardScaler();, score=0.279 total time=   0.0s
[CV 2/5] END regressor=LinearRegression(), scaler=StandardScaler();, score=0.278 total time=   0.0s
[CV 3/5] END regressor=LinearRegression(), scaler=StandardScaler();, score=0.292 total time=   0.0s
[CV 4/5] END regressor=LinearRegression(), scaler=StandardScaler();, score=0.283 total time=   0.0s
[CV 5/5] END regressor=LinearRegression(), scaler=StandardSca



[CV 5/5] END regressor=Lasso(max_iter=2000, random_state=42, tol=0.001), regressor__alpha=1.0, scaler=passthrough;, score=0.178 total time=   0.0s
[CV 1/5] END regressor=Lasso(max_iter=2000, random_state=42, tol=0.001), regressor__alpha=1.0, scaler=StandardScaler();, score=nan total time=   0.0s
[CV 2/5] END regressor=Lasso(max_iter=2000, random_state=42, tol=0.001), regressor__alpha=1.0, scaler=StandardScaler();, score=nan total time=   0.0s
[CV 3/5] END regressor=Lasso(max_iter=2000, random_state=42, tol=0.001), regressor__alpha=1.0, scaler=StandardScaler();, score=nan total time=   0.0s
[CV 4/5] END regressor=Lasso(max_iter=2000, random_state=42, tol=0.001), regressor__alpha=1.0, scaler=StandardScaler();, score=nan total time=   0.0s
[CV 5/5] END regressor=Lasso(max_iter=2000, random_state=42, tol=0.001), regressor__alpha=1.0, scaler=StandardScaler();, score=nan total time=   0.0s
[CV 1/5] END regressor=Lasso(max_iter=2000, random_state=42, tol=0.001), regressor__alpha=1.0, scaler=N



[CV 4/5] END regressor=Lasso(max_iter=2000, random_state=42, tol=0.001), regressor__alpha=1.0, scaler=Normalizer();, score=nan total time=   0.0s
[CV 5/5] END regressor=Lasso(max_iter=2000, random_state=42, tol=0.001), regressor__alpha=1.0, scaler=Normalizer();, score=nan total time=   0.0s
[CV 1/5] END regressor=Lasso(max_iter=2000, random_state=42, tol=0.001), regressor__alpha=10.0, scaler=passthrough;, score=nan total time=   0.0s
[CV 2/5] END regressor=Lasso(max_iter=2000, random_state=42, tol=0.001), regressor__alpha=10.0, scaler=passthrough;, score=nan total time=   0.0s
[CV 3/5] END regressor=Lasso(max_iter=2000, random_state=42, tol=0.001), regressor__alpha=10.0, scaler=passthrough;, score=nan total time=   0.0s
[CV 4/5] END regressor=Lasso(max_iter=2000, random_state=42, tol=0.001), regressor__alpha=10.0, scaler=passthrough;, score=nan total time=   0.0s
[CV 5/5] END regressor=Lasso(max_iter=2000, random_state=42, tol=0.001), regressor__alpha=10.0, scaler=passthrough;, score=n



[CV 5/5] END regressor=Lasso(max_iter=2000, random_state=42, tol=0.001), regressor__alpha=10.0, scaler=StandardScaler();, score=nan total time=   0.0s
[CV 1/5] END regressor=Lasso(max_iter=2000, random_state=42, tol=0.001), regressor__alpha=10.0, scaler=Normalizer();, score=nan total time=   0.0s
[CV 2/5] END regressor=Lasso(max_iter=2000, random_state=42, tol=0.001), regressor__alpha=10.0, scaler=Normalizer();, score=nan total time=   0.0s
[CV 3/5] END regressor=Lasso(max_iter=2000, random_state=42, tol=0.001), regressor__alpha=10.0, scaler=Normalizer();, score=nan total time=   0.0s
[CV 4/5] END regressor=Lasso(max_iter=2000, random_state=42, tol=0.001), regressor__alpha=10.0, scaler=Normalizer();, score=nan total time=   0.0s
[CV 5/5] END regressor=Lasso(max_iter=2000, random_state=42, tol=0.001), regressor__alpha=10.0, scaler=Normalizer();, score=nan total time=   0.0s
[CV 1/5] END regressor=Lasso(max_iter=2000, random_state=42, tol=0.001), regressor__alpha=100.0, scaler=passthroug



[CV 2/5] END regressor=Lasso(max_iter=2000, random_state=42, tol=0.001), regressor__alpha=100.0, scaler=StandardScaler();, score=nan total time=   0.0s
[CV 3/5] END regressor=Lasso(max_iter=2000, random_state=42, tol=0.001), regressor__alpha=100.0, scaler=StandardScaler();, score=nan total time=   0.0s
[CV 4/5] END regressor=Lasso(max_iter=2000, random_state=42, tol=0.001), regressor__alpha=100.0, scaler=StandardScaler();, score=nan total time=   0.0s
[CV 5/5] END regressor=Lasso(max_iter=2000, random_state=42, tol=0.001), regressor__alpha=100.0, scaler=StandardScaler();, score=nan total time=   0.0s
[CV 1/5] END regressor=Lasso(max_iter=2000, random_state=42, tol=0.001), regressor__alpha=100.0, scaler=Normalizer();, score=nan total time=   0.0s
[CV 2/5] END regressor=Lasso(max_iter=2000, random_state=42, tol=0.001), regressor__alpha=100.0, scaler=Normalizer();, score=nan total time=   0.0s
[CV 3/5] END regressor=Lasso(max_iter=2000, random_state=42, tol=0.001), regressor__alpha=100.0,



[CV 1/5] END regressor=Lasso(max_iter=2000, random_state=42, tol=0.001), regressor__alpha=1000.0, scaler=passthrough;, score=nan total time=   0.0s
[CV 2/5] END regressor=Lasso(max_iter=2000, random_state=42, tol=0.001), regressor__alpha=1000.0, scaler=passthrough;, score=nan total time=   0.0s
[CV 3/5] END regressor=Lasso(max_iter=2000, random_state=42, tol=0.001), regressor__alpha=1000.0, scaler=passthrough;, score=nan total time=   0.0s
[CV 4/5] END regressor=Lasso(max_iter=2000, random_state=42, tol=0.001), regressor__alpha=1000.0, scaler=passthrough;, score=nan total time=   0.0s
[CV 5/5] END regressor=Lasso(max_iter=2000, random_state=42, tol=0.001), regressor__alpha=1000.0, scaler=passthrough;, score=nan total time=   0.0s
[CV 1/5] END regressor=Lasso(max_iter=2000, random_state=42, tol=0.001), regressor__alpha=1000.0, scaler=StandardScaler();, score=nan total time=   0.0s
[CV 2/5] END regressor=Lasso(max_iter=2000, random_state=42, tol=0.001), regressor__alpha=1000.0, scaler=St



[CV 1/5] END regressor=Lasso(max_iter=2000, random_state=42, tol=0.001), regressor__alpha=1000.0, scaler=Normalizer();, score=nan total time=   0.0s
[CV 2/5] END regressor=Lasso(max_iter=2000, random_state=42, tol=0.001), regressor__alpha=1000.0, scaler=Normalizer();, score=nan total time=   0.0s
[CV 3/5] END regressor=Lasso(max_iter=2000, random_state=42, tol=0.001), regressor__alpha=1000.0, scaler=Normalizer();, score=nan total time=   0.0s
[CV 4/5] END regressor=Lasso(max_iter=2000, random_state=42, tol=0.001), regressor__alpha=1000.0, scaler=Normalizer();, score=nan total time=   0.0s
[CV 5/5] END regressor=Lasso(max_iter=2000, random_state=42, tol=0.001), regressor__alpha=1000.0, scaler=Normalizer();, score=nan total time=   0.0s
[CV 1/5] END regressor=Ridge(max_iter=2000, random_state=42, tol=0.001), regressor__alpha=0.001, scaler=passthrough;, score=0.279 total time=   0.0s
[CV 2/5] END regressor=Ridge(max_iter=2000, random_state=42, tol=0.001), regressor__alpha=0.001, scaler=pa



[CV 4/5] END regressor=Ridge(max_iter=2000, random_state=42, tol=0.001), regressor__alpha=0.001, scaler=StandardScaler();, score=0.283 total time=   0.0s
[CV 5/5] END regressor=Ridge(max_iter=2000, random_state=42, tol=0.001), regressor__alpha=0.001, scaler=StandardScaler();, score=0.283 total time=   0.0s
[CV 1/5] END regressor=Ridge(max_iter=2000, random_state=42, tol=0.001), regressor__alpha=0.001, scaler=Normalizer();, score=0.285 total time=   0.0s
[CV 2/5] END regressor=Ridge(max_iter=2000, random_state=42, tol=0.001), regressor__alpha=0.001, scaler=Normalizer();, score=0.287 total time=   0.0s
[CV 3/5] END regressor=Ridge(max_iter=2000, random_state=42, tol=0.001), regressor__alpha=0.001, scaler=Normalizer();, score=0.299 total time=   0.0s
[CV 4/5] END regressor=Ridge(max_iter=2000, random_state=42, tol=0.001), regressor__alpha=0.001, scaler=Normalizer();, score=0.294 total time=   0.0s
[CV 5/5] END regressor=Ridge(max_iter=2000, random_state=42, tol=0.001), regressor__alpha=0.

[CV 5/5] END regressor=Ridge(max_iter=2000, random_state=42, tol=0.001), regressor__alpha=10.0, scaler=StandardScaler();, score=0.283 total time=   0.0s
[CV 1/5] END regressor=Ridge(max_iter=2000, random_state=42, tol=0.001), regressor__alpha=10.0, scaler=Normalizer();, score=0.284 total time=   0.0s
[CV 2/5] END regressor=Ridge(max_iter=2000, random_state=42, tol=0.001), regressor__alpha=10.0, scaler=Normalizer();, score=0.286 total time=   0.0s
[CV 3/5] END regressor=Ridge(max_iter=2000, random_state=42, tol=0.001), regressor__alpha=10.0, scaler=Normalizer();, score=0.299 total time=   0.0s
[CV 4/5] END regressor=Ridge(max_iter=2000, random_state=42, tol=0.001), regressor__alpha=10.0, scaler=Normalizer();, score=0.294 total time=   0.0s
[CV 5/5] END regressor=Ridge(max_iter=2000, random_state=42, tol=0.001), regressor__alpha=10.0, scaler=Normalizer();, score=0.290 total time=   0.0s
[CV 1/5] END regressor=Ridge(max_iter=2000, random_state=42, tol=0.001), regressor__alpha=100.0, scale

[CV 1/5] END regressor=LinearSVR(max_iter=2000, random_state=42, tol=0.001), regressor__C=0.001, regressor__loss=squared_epsilon_insensitive, scaler=passthrough;, score=0.249 total time=   0.3s
[CV 2/5] END regressor=LinearSVR(max_iter=2000, random_state=42, tol=0.001), regressor__C=0.001, regressor__loss=squared_epsilon_insensitive, scaler=passthrough;, score=0.255 total time=   0.3s
[CV 3/5] END regressor=LinearSVR(max_iter=2000, random_state=42, tol=0.001), regressor__C=0.001, regressor__loss=squared_epsilon_insensitive, scaler=passthrough;, score=0.266 total time=   0.3s
[CV 4/5] END regressor=LinearSVR(max_iter=2000, random_state=42, tol=0.001), regressor__C=0.001, regressor__loss=squared_epsilon_insensitive, scaler=passthrough;, score=0.255 total time=   0.3s
[CV 5/5] END regressor=LinearSVR(max_iter=2000, random_state=42, tol=0.001), regressor__C=0.001, regressor__loss=squared_epsilon_insensitive, scaler=passthrough;, score=0.253 total time=   0.3s
[CV 1/5] END regressor=LinearS

### Results

In [None]:
pd.DataFrame(grid_search.cv_results_)

In [None]:
grid_search.best_estimator_

In [None]:
y_pred = regressor.predict(xtest)
y_pred.shape, pearsonr(ytest, y_pred)

### Saving data to server

In [None]:
data_server = joblib.load(f"{MAIN_OUT_PATH}/static/data_server.joblib")
data_server.keys()

In [None]:
y_pred = regressor.predict( np.array(data_server["features"]).copy() )
y_pred.shape

In [None]:
data_server["prediction"] = y_pred.tolist()

In [None]:
joblib.dump(data_dict, f"{MAIN_OUT_PATH}/static/data_server.joblib")

In [None]:
os.exit()

In [None]:
print(classification_report(
    ytest,
    grid_search.predict(xtest),
    digits=4,
))

### Feature Importance

In [None]:
importance_classes = data_df.columns[1:-3]
importance_classes.values

In [None]:
importance = grid_search.best_estimator_._final_estimator.coef_[0]

In [None]:
feat_importances = pd.Series(importance)
feat_importances = feat_importances.reset_index().rename(columns={"index":"class", 0:"importance"}).copy()
feat_importances["class"] = importance_classes.values
feat_importances.set_index("class", inplace=True)
feat_importances.sort_values(by="importance", inplace=True, ascending=False)
feat_importances

In [None]:
feat_importances.iloc[:20,:].plot(kind='barh',title = 'Feature Importance')

In [None]:
feat_importances.iloc[-20:,:].plot(kind='barh',title = 'Feature Importance')