<h1>Application of Apache Spark</h1>
<h3>with Feature Engineering</h3>

<h4>Focus on feature selection, processing speed, model iteration</h4>
<ul>
    <li>Will create different training sets w/ reature mixes</li>
    <li>PyTorch, Numba, Parralization, and Dask for processing speed</li>
    <li>Processing tutorials: 
        <a href = 'https://towardsdatascience.com/speed-up-your-algorithms-part-3-parallelization-4d95c0888748'>speeding up your algorithms</a>, 
        <a href = 'https://towardsdatascience.com/improving-random-forest-in-python-part-1-893916666cd'>improving random forest</a>
    </li>
    <li>Split train into 3 train/test splits, run models and compare results before running on final test</li>
</ul>

In [1]:
from importlib import reload #for changes in helpers
import time

import pandas as pd
import numpy as np
import scipy.stats

pd.set_option('display.max_columns', 500)
pd.set_option('display.width', 1000)

import matplotlib.pyplot as plt
import seaborn as sns


from sklearn import preprocessing
from sklearn.decomposition import PCA
from sklearn.cluster import KMeans
from sklearn.preprocessing import LabelEncoder

from sklearn.model_selection import train_test_split
from sklearn import linear_model 
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import mean_squared_error

import helpers
reload(helpers)

sns.set()

In [2]:
train = pd.read_csv('../geotab-data/train.csv')
test = pd.read_csv('../geotab-data/test.csv')

<h1>Target PCA</h1>

<ul>
    <li>Light summary eda</li>
    <li>Min max scaler and target PCA</li>
    <li>Not sure if this is necessary for purpose of this notebook</li>
    </ul>

In [3]:
targets = train.iloc[:,12:27]

#targets to predict
total_time = targets[['TotalTimeStopped_p20','TotalTimeStopped_p50', 'TotalTimeStopped_p80']]
distance_to_first = targets[['DistanceToFirstStop_p20','DistanceToFirstStop_p50','DistanceToFirstStop_p80']]
target_cols = list(total_time.columns) + list(distance_to_first.columns)
p_targets = targets[target_cols]

#optional targets
time_from_first = targets[['TimeFromFirstStop_p20','TimeFromFirstStop_p50','TimeFromFirstStop_p80']]

print('Target Summaries:')
display(helpers.summarize(p_targets, True))

Target Summaries:


Unnamed: 0,name,dtypes,missing,unique,first_val,last_val,max,mean,median,stdev,entropy
0,TotalTimeStopped_p20,int64,0,172,0.0,0.0,273.0,1.731272,0.0,7.080017,0.92
1,TotalTimeStopped_p50,int64,0,264,0.0,0.0,343.0,7.681874,0.0,15.553418,2.7
2,TotalTimeStopped_p80,int64,0,403,0.0,0.0,689.0,22.948071,16.0,28.118134,5.06
3,DistanceToFirstStop_p20,float64,0,3479,0.0,0.0,1902.7,6.56445,0.0,28.003261,1.35
4,DistanceToFirstStop_p50,float64,0,7483,0.0,0.0,3099.5,28.255852,0.0,71.72009,4.16
5,DistanceToFirstStop_p80,float64,0,13267,0.0,0.0,4064.3,81.922639,60.4,152.68276,8.1


In [4]:
#scale targets w/ a min max scalers and append to train
for col in target_cols:
    train[col+str("_minmax")] = (preprocessing.minmax_scale(train[col], feature_range=(0,1)))
    
min_max_cols = ['TotalTimeStopped_p20_minmax', 'TotalTimeStopped_p50_minmax',
                'TotalTimeStopped_p80_minmax', 'DistanceToFirstStop_p20_minmax',
                'DistanceToFirstStop_p50_minmax', 'DistanceToFirstStop_p80_minmax']

pca = PCA(n_components=3, random_state=5)

principalComponents = pca.fit_transform(train[min_max_cols])
principalDf = pd.DataFrame(principalComponents)
pca.explained_variance_ratio_

array([0.66396904, 0.17536384, 0.07856878])

<h1>Feature Engineering</h1>

Time and day features

In [5]:
#making hour columns cyclical
train = helpers.date_cyc_enc(train, 'Hour', 24)
test = helpers.date_cyc_enc(test, 'Hour', 24) 

#encoding time of day
train['is_day'] = train['Hour'].apply(lambda x: 1 if 7 < x < 18 else 0)
test['is_day'] = test['Hour'].apply(lambda x: 1 if 7 < x < 18 else 0)

train['is_morning'] = train['Hour'].apply(lambda x: 1 if 6 < x < 10 else 0)
test['is_morning'] = test['Hour'].apply(lambda x: 1 if 6 < x < 10 else 0)

train['is_night'] = train['Hour'].apply(lambda x: 1 if 17 < x < 20 else 0)
test['is_night'] = test['Hour'].apply(lambda x: 1 if 17 < x < 20 else 0)

#encoding weekend vs not weekend
train['is_day_weekend'] = np.where((train['is_day'] == 1) & (train['Weekend'] == 1), 1,0)
test['is_day_weekend'] = np.where((test['is_day'] == 1) & (train['Weekend'] == 1), 1,0)

train['is_mor_weekend'] = np.where((train['is_morning'] == 1) & (train['Weekend'] == 1), 1,0)
test['is_mor_weekend'] = np.where((test['is_morning'] == 1) & (train['Weekend'] == 1), 1,0)

train['is_nig_weekend'] = np.where((train['is_night'] == 1) & (train['Weekend'] == 1), 1,0)
test['is_nig_weekend'] = np.where((test['is_night'] == 1) & (train['Weekend'] == 1), 1,0)

Location and direction features

In [6]:
#intersection and city concat
train["Intersec"] = train["IntersectionId"].astype(str) + train["City"]
test["Intersec"] = test["IntersectionId"].astype(str) + test["City"]

#make numerical and drop concat
le = LabelEncoder()
le.fit(pd.concat([train["Intersec"],test["Intersec"]]).drop_duplicates().values)
train["Intersec"] = le.transform(train["Intersec"])
test["Intersec"] = le.transform(test["Intersec"])

train['EntryType'] = train['EntryStreetName'].apply(helpers.road_encode)
train['ExitType'] = train['ExitStreetName'].apply(helpers.road_encode)
test['EntryType'] = test['EntryStreetName'].apply(helpers.road_encode)
test['ExitType'] = test['ExitStreetName'].apply(helpers.road_encode)

#map directional encoding in in train and test
train['EntryHeading'] = train['EntryHeading'].map(helpers.directions)
train['ExitHeading'] = train['ExitHeading'].map(helpers.directions)
test['EntryHeading'] = test['EntryHeading'].map(helpers.directions)
test['ExitHeading'] = test['ExitHeading'].map(helpers.directions)

#heading differences will tell us the degree at which you traveled, (0 degrees is straight, 180 u turn)
train['diffHeading'] = train['EntryHeading']-train['ExitHeading']  
test['diffHeading'] = test['EntryHeading']-test['ExitHeading'] 

#if you stay on the same street feature
train["same_str"] = (train["EntryStreetName"] ==  train["ExitStreetName"]).astype(int)
test["same_str"] = (test["EntryStreetName"] ==  test["ExitStreetName"]).astype(int)

Secondary features, monthly rainfall by city 

In [7]:
# Concatenating the city and month into one variable
train['city_month'] = train["City"] + train["Month"].astype(str)
test['city_month'] = test["City"] + test["Month"].astype(str)

# Creating a new column by mapping the city_month variable to it's corresponding average monthly rainfall
train["average_rainfall"] = train['city_month'].map(helpers.monthly_rainfall)
test["average_rainfall"] = test['city_month'].map(helpers.monthly_rainfall)

Replace city w/ dummy variables, can't run cells above after this

In [8]:
#dummy variables for city, drops city
try:
    train = pd.get_dummies(train, columns=['City' ],prefix=['City'], drop_first=False)
    test = pd.get_dummies(test, columns=['City' ],prefix=['City'], drop_first=False)
except KeyError:
    pass

Scale lat and longitude

In [9]:
#wonder if this would be different with min max
scaler = preprocessing.StandardScaler()
for col in ['Latitude','Longitude']:
    scaler.fit(train[col].values.reshape(-1, 1))
    train[col] = scaler.transform(train[col].values.reshape(-1, 1))
    test[col] = scaler.transform(test[col].values.reshape(-1, 1))

In [10]:
train.drop(['RowId', 'Path','EntryStreetName','ExitStreetName'],axis=1, inplace=True)
test.drop(['RowId', 'Path','EntryStreetName','ExitStreetName'],axis=1, inplace=True)

In [11]:
final_features = ['IntersectionId', 'Latitude', 'Longitude', 'EntryHeading',
                    'ExitHeading', 'Hour', 'Weekend', 'Month',
                    'is_morning', 'is_night', 'is_day_weekend', 'is_mor_weekend',
                    'is_nig_weekend', 
                    #'Hour', 
                    'Hour_sin', 
                    'Hour_cos', 
                    'same_str', 'Intersec', 'EntryType',
                    'ExitType', 'diffHeading', 'average_rainfall', 'is_day',
                    'City_Boston', 'City_Chicago', 'City_Philadelphia', 
                    'City_Atlanta']

<h1>PySpark</h1>

In [12]:
import os

import findspark
findspark.init()

from pyspark import SparkConf, SparkContext
from pyspark.sql import SparkSession, SQLContext

from pyspark.sql.types import *
import pyspark.sql.functions as F
from pyspark.sql.functions import udf, col

from pyspark.ml.regression import LinearRegression
from pyspark.mllib.evaluation import RegressionMetrics

from pyspark.ml.tuning import ParamGridBuilder, CrossValidator, CrossValidatorModel
from pyspark.ml.feature import VectorAssembler, StandardScaler
from pyspark.ml.evaluation import RegressionEvaluator

In [13]:
# Visualization
from IPython.core.interactiveshell import InteractiveShell
InteractiveShell.ast_node_interactivity = "all"

pd.set_option('display.max_columns', 200)
pd.set_option('display.max_colwidth', 400)

from matplotlib import rcParams
sns.set(context='notebook', style='whitegrid', rc={'figure.figsize': (18,4)})
rcParams['figure.figsize'] = 18,4

%matplotlib inline
%config InlineBackend.figure_format = 'retina'

### creating Spark session

In [21]:
import pyspark

print(os.environ['SPARK_HOME'])
print(os.environ['JAVA_HOME'])
print(os.environ['HADOOP_HOME'])

C:\Users\emag3\Documents\Code\Spark\spark-2.4.4-bin-hadoop2.7
C:\Users\emag3\Java
C:\Users\emag3\Documents\Code\Spark\hadoop


In [32]:
def init_spark():
  spark = SparkSession.builder.appName("Intersection").getOrCreate()
  sc = spark.sparkContext
  return spark,sc

def main():
  spark,sc = init_spark()
  nums = sc.parallelize([1,2,3,4])
  print(nums.map(lambda x: x*x).collect())

<h4>testing function</h4>

In [39]:
init_spark()
main()

spark = SparkSession.builder.appName("Intersection").getOrCreate()
sc = spark.sparkContext

(<pyspark.sql.session.SparkSession at 0x124de7677c8>,
 <SparkContext master=local[*] appName=Intersection>)

[1, 4, 9, 16]


<h4>convert pandas dataframe to Apache Spark dataframe</h4>

In [40]:
sqlContext = SQLContext(sc)

sp_train = sqlContext.createDataFrame(train)
sp_test = sqlContext.createDataFrame(test)

In [41]:
sp_train.show(3)

+--------------+-------------------+------------------+------------+-----------+----+-------+-----+--------------------+--------------------+--------------------+--------------------+--------------------+---------------------+---------------------+---------------------+---------------------+---------------------+-----------------------+-----------------------+-----------------------+-----------------------+-----------------------+---------------------------+---------------------------+---------------------------+------------------------------+------------------------------+------------------------------+-------------------+------------------+------+----------+--------+--------------+--------------+--------------+--------+---------+--------+-----------+--------+----------+----------------+------------+-----------+------------+-----------------+
|IntersectionId|           Latitude|         Longitude|EntryHeading|ExitHeading|Hour|Weekend|Month|TotalTimeStopped_p20|TotalTimeStopped_p40|Tot

In [42]:
sp_test.show(3)

+--------------+-------------------+-------------------+------------+-----------+----+-------+-----+-------------------+------------------+------+----------+--------+--------------+--------------+--------------+--------+---------+--------+-----------+--------+----------+----------------+------------+-----------+------------+-----------------+
|IntersectionId|           Latitude|          Longitude|EntryHeading|ExitHeading|Hour|Weekend|Month|           Hour_sin|          Hour_cos|is_day|is_morning|is_night|is_day_weekend|is_mor_weekend|is_nig_weekend|Intersec|EntryType|ExitType|diffHeading|same_str|city_month|average_rainfall|City_Atlanta|City_Boston|City_Chicago|City_Philadelphia|
+--------------+-------------------+-------------------+------------+-----------+----+-------+-----+-------------------+------------------+------+----------+--------+--------------+--------------+--------------+--------+---------+--------+-----------+--------+----------+----------------+------------+---------

In [43]:
sp_train.count()

857409

In [None]:
sp_test.count()

<h1>Model building</h1>
Useful Vaiables:
<ul>
    <li>final_features - list final set of features for prediction</li>
    <li>target_cols - list of targets to predict</li>
    <li>train - full train data set with derived features</li>
    <li>test - full test data set with derived features</li>
</ul>

In [12]:
print("Train dataset shape: "+ str(train.shape))
print("Test dataset shape:  "+ str(test.shape))

Train dataset shape: (857409, 48)
Test dataset shape:  (1920335, 27)


In [13]:
#X and y for train set
X = train[final_features]
y = train[target_cols]
ys = [train[column] for column in target_cols]

#only run this at the end, to get final prediction
X_final = test[final_features]

#reduce mem usage on feature sets (X)
X = helpers.reduce_mem_usage(X)
X_final = helpers.reduce_mem_usage(X_final)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype(np.int16)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype(np.float64)
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  df[col] = df[col].astype(np.int8)


Mem. usage decreased to 68.69 Mb (47.5% reduction)
Mem. usage decreased to 153.84 Mb (47.5% reduction)


<h2>Model iteration and validation</h2>
<ul>
    <li>Below we split original train into 2 train/tests using different random seeds</li>
    <li>We do so to test accuracy of model before crating final sumbission, which we can't pre score</li>
    <li>First item is to test model code, uses random 10k sample to make sure code runs before we leave i
</ul>

In [14]:
validation_size = 0.25
#75% to train and 25% to test

#this is just for to ensure running models works head(10k), speed cuz 10k is small and it should run fast
SPEED = train.sample(10000,random_state=2)
X_SPEED = SPEED[final_features] 
Y_SPEED = SPEED[target_cols]

X_train_SPEED, X_validation_SPEED, Y_train_SPEED, Y_validation_SPEED = train_test_split(X_SPEED, Y_SPEED, test_size=validation_size, random_state=3)

<h3>Real Validators</h3>

In [15]:
X_train_1, X_validation_1, Y_train_1, Y_validation_1 = train_test_split(X, y, test_size=validation_size, random_state=7)
X_train_2, X_validation_2, Y_train_2, Y_validation_2 = train_test_split(X, y, test_size=validation_size, random_state=23232)

In [16]:
#sets is a list of X_train_1, X_validation_1, Y_train_1, Y_validation_1 for every target value
sets = []
for y_target in ys:
    sets.append(train_test_split(X, y_target, test_size=validation_size, random_state=6))

In [17]:
print(target_cols)

['TotalTimeStopped_p20', 'TotalTimeStopped_p50', 'TotalTimeStopped_p80', 'DistanceToFirstStop_p20', 'DistanceToFirstStop_p50', 'DistanceToFirstStop_p80']


In [18]:
from sklearn.ensemble import AdaBoostRegressor

DistanceToFirstStop_p80 = sets[-1]

clf = AdaBoostRegressor(n_estimators=60)
clf_results = helpers.run_model(clf,DistanceToFirstStop_p80[0], DistanceToFirstStop_p80[1], DistanceToFirstStop_p80[2], DistanceToFirstStop_p80[3])

In [19]:
#score: 215.33 > 193.5 > 188.39
clf_results

{'run_time': '0:01:02.673537', 'run_score': 172.2698934659224}

In [20]:
from sklearn.neighbors import KNeighborsRegressor

In [21]:
#n=5, score: 114.8
neigh = KNeighborsRegressor(n_neighbors=5)
clf_results = helpers.run_model(neigh,DistanceToFirstStop_p80[0], DistanceToFirstStop_p80[1], DistanceToFirstStop_p80[2], DistanceToFirstStop_p80[3])
clf_results

{'run_time': '0:00:20.635369', 'run_score': 115.42958679796664}

In [22]:
import numpy as np
from sklearn.linear_model import LinearRegression
reg = LinearRegression()
clf_results = helpers.run_model(reg,DistanceToFirstStop_p80[0], DistanceToFirstStop_p80[1], DistanceToFirstStop_p80[2], DistanceToFirstStop_p80[3])
clf_results

{'run_time': '0:00:00.987725', 'run_score': 150.92582321936717}

In [23]:
#reg = Ridge(alpha=0.005)
#clf_results = helpers.run_model(reg,DistanceToFirstStop_p80[0], DistanceToFirstStop_p80[1], DistanceToFirstStop_p80[2], DistanceToFirstStop_p80[3])
#clf_results

NameError: name 'Ridge' is not defined

In [24]:
from scipy import stats
from sklearn.linear_model import BayesianRidge, LinearRegression

In [25]:
reg = BayesianRidge(compute_score=True)
clf_results = helpers.run_model(reg,DistanceToFirstStop_p80[0], DistanceToFirstStop_p80[1], DistanceToFirstStop_p80[2], DistanceToFirstStop_p80[3])
clf_results

{'run_time': '0:00:02.160881', 'run_score': 150.92574848028391}

In [28]:
from sklearn.kernel_ridge import KernelRidge

#reg = KernelRidge(alpha=1.0)
#clf_results = helpers.run_model(reg,DistanceToFirstStop_p80[0], DistanceToFirstStop_p80[1], DistanceToFirstStop_p80[2], DistanceToFirstStop_p80[3])
#clf_results

In [29]:
from sklearn import tree
#score: 111.71
clf = tree.DecisionTreeRegressor()
clf_results = helpers.run_model(clf,DistanceToFirstStop_p80[0], DistanceToFirstStop_p80[1], DistanceToFirstStop_p80[2], DistanceToFirstStop_p80[3])
clf_results

{'run_time': '0:00:12.284308', 'run_score': 110.91236233221149}

In [31]:
from sklearn.datasets import load_diabetes
from sklearn.linear_model import RidgeCV
from sklearn.svm import LinearSVR
from sklearn.ensemble import RandomForestRegressor
#from sklearn.ensemble import StackingRegressor

In [33]:
reg = RandomForestRegressor(n_estimators=10,min_samples_split=3, n_jobs=5)
reg_results = helpers.run_model(reg,DistanceToFirstStop_p80[0], DistanceToFirstStop_p80[1], DistanceToFirstStop_p80[2], DistanceToFirstStop_p80[3])
reg_results

{'run_time': '0:00:43.873751', 'run_score': 86.83906446832533}

In [35]:
from sklearn.model_selection import RandomizedSearchCV

In [37]:
# Number of trees in random forest
n_estimators = [int(x) for x in np.linspace(start = 1, stop = 50, num = 10)]
# Number of features to consider at every split
max_features = ['auto', 'sqrt']
# Maximum number of levels in tree
max_depth = [int(x) for x in np.linspace(10, 110, num = 11)]
max_depth.append(None)
# Minimum number of samples required to split a node
min_samples_split = [2, 5, 10]
# Minimum number of samples required at each leaf node
min_samples_leaf = [1, 2, 4]
# Method of selecting samples for training each tree
bootstrap = [True, False]
# Create the random grid
random_grid = {'n_estimators': n_estimators,
               'max_features': max_features,
               'max_depth': max_depth,
               'min_samples_split': min_samples_split,
               'min_samples_leaf': min_samples_leaf,
               'bootstrap': bootstrap}
print(random_grid)
"""
{'bootstrap': [True, False],
 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],
 'max_features': ['auto', 'sqrt'],
 'min_samples_leaf': [1, 2, 4],
 'min_samples_split': [2, 5, 10],
 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}
 """

{'n_estimators': [1, 6, 11, 17, 22, 28, 33, 39, 44, 50], 'max_features': ['auto', 'sqrt'], 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, 110, None], 'min_samples_split': [2, 5, 10], 'min_samples_leaf': [1, 2, 4], 'bootstrap': [True, False]}


"\n{'bootstrap': [True, False],\n 'max_depth': [10, 20, 30, 40, 50, 60, 70, 80, 90, 100, None],\n 'max_features': ['auto', 'sqrt'],\n 'min_samples_leaf': [1, 2, 4],\n 'min_samples_split': [2, 5, 10],\n 'n_estimators': [200, 400, 600, 800, 1000, 1200, 1400, 1600, 1800, 2000]}\n "

In [38]:
# Use the random grid to search for best hyperparameters
# First create the base model to tune
rf = RandomForestRegressor()
# Random search of parameters, using 3 fold cross validation, 
# search across 100 different combinations, and use all available cores
rf_random = RandomizedSearchCV(estimator = rf, param_distributions = random_grid, n_iter = 100, cv = 3, verbose=2, random_state=42, n_jobs = -1)
# Fit the random search model
rf_random.fit(DistanceToFirstStop_p80[0], DistanceToFirstStop_p80[2])

Fitting 3 folds for each of 100 candidates, totalling 300 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed: 28.8min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 106.9min
[Parallel(n_jobs=-1)]: Done 300 out of 300 | elapsed: 193.8min finished


RandomizedSearchCV(cv=3, error_score='raise-deprecating',
                   estimator=RandomForestRegressor(bootstrap=True,
                                                   criterion='mse',
                                                   max_depth=None,
                                                   max_features='auto',
                                                   max_leaf_nodes=None,
                                                   min_impurity_decrease=0.0,
                                                   min_impurity_split=None,
                                                   min_samples_leaf=1,
                                                   min_samples_split=2,
                                                   min_weight_fraction_leaf=0.0,
                                                   n_estimators='warn',
                                                   n_jobs=None, oob_score=False,
                                                   random_sta...


In [39]:
rf_random.best_params_

{'n_estimators': 50,
 'min_samples_split': 10,
 'min_samples_leaf': 2,
 'max_features': 'auto',
 'max_depth': 40,
 'bootstrap': True}

In [None]:
"""
estimators = [('lr', RidgeCV()), ('svr', LinearSVR(random_state=42))]
reg = StackingRegressor(
    estimators=estimators,
    final_estimator=RandomForestRegressor(n_estimators=10,
                                          random_state=42)
clf_results = helpers.run_model(reg,DistanceToFirstStop_p80[0], DistanceToFirstStop_p80[1], DistanceToFirstStop_p80[2], DistanceToFirstStop_p80[3])
clf_results
    """

In [None]:
rf_20 = RandomForestRegressor(n_estimators=20,min_samples_split=3, n_jobs=-1)

In [None]:
rf_20_results = []
feature_importances = []
for x in sets: 
    rf_20_results.append(helpers.run_model(rf_20,x[0],x[1],x[2],x[3]))
    feature_importances.append(helpers.importance_df(rf_20, final_features))

In [None]:
rf_20_results

In [None]:
helpers.importance_df(rf_20, final_features)

In [None]:
target_cols

In [None]:
#models
rf_20 = RandomForestRegressor(n_estimators=20,min_samples_split=3, n_jobs=-1)
rf_30 = RandomForestRegressor(n_estimators=30,min_samples_split=3, n_jobs=-1)
models = [rf_20, rf_30]

In [None]:
runs_1=[]
for i,model in enumerate(models): 
    runs.append({'run_'+str(i): helpers.run_model(model, X_train_1, X_validation_1, Y_train_1, Y_validation_1)})

In [None]:
runs_1

In [None]:
runs_2=[]
for i,model in enumerate(models): 
    runs.append({'run_'+str(i): helpers.run_model(model, X_train_2, X_validation_2, Y_train_2, Y_validation_2)})

In [None]:
from sklearn import svm

sv = svm.SVR()

In [None]:
from sklearn.ensemble import ExtraTreesRegressor

etc = ExtraTreesRegressor(n_estimators=30,min_samples_split=3, n_jobs=-1)

In [None]:
etc_results = helpers.run_model(etc,X_train_1, X_validation_1, Y_train_1, Y_validation_1 )

In [None]:
etc_results

In [None]:
etc_results = RandomForestRegressor(n_estimators=40,min_samples_split=3, n_jobs=5)

In [None]:
helpers.run_model(rfg,X_train_1, X_validation_1, Y_train_1, Y_validation_1 )

Random charts below

In [None]:
# list of x locations for plotting
x_values = list(range(len(importances)))
# Make a bar chart
plt.bar(x_values, importances, orientation = 'vertical', color = 'r', edgecolor = 'k', linewidth = 1.2)
# Tick labels for x axis
plt.xticks(x_values, final_features, rotation='vertical')
# Axis labels and title
plt.ylabel('Importance'); plt.xlabel('Variable'); plt.title('Variable Importances')

In [None]:
# List of features sorted from most to least important
sorted_importances = [importance[1] for importance in feature_importances]
sorted_features = [importance[0] for importance in feature_importances]
# Cumulative importances
cumulative_importances = np.cumsum(sorted_importances)
# Make a line graph
plt.plot(x_values, cumulative_importances, 'g-')
# Draw line at 95% of importance retained
plt.hlines(y = 0.95, xmin=0, xmax=len(sorted_importances), color = 'r', linestyles = 'dashed')
# Format x ticks and labels
plt.xticks(x_values, sorted_features, rotation = 'vertical')
# Axis labels and title
plt.xlabel('Variable'); plt.ylabel('Cumulative Importance'); plt.title('Cumulative Importances');