In [1]:
import pandas as pd
%matplotlib inline
import matplotlib.pyplot as plt
import seaborn as sns
import warnings
warnings.filterwarnings('ignore')
import numpy as np


from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, StratifiedKFold
from sklearn.model_selection import GridSearchCV, cross_val_score

from sklearn.metrics import roc_curve, auc
from sklearn.preprocessing import label_binarize
from sklearn.multiclass import OneVsRestClassifier
import matplotlib.pyplot as plt
import scikitplot as skplt

def compute_score(clf, X, y, scoring='mean_squared_error'):
    xval = cross_val_score(clf, X, y, cv = 5, scoring=scoring)
    return np.mean(xval)

In [2]:
# df = pd.read_csv('cotton.csv')
df = pd.read_csv('pollution2.csv')
df['wnd_dir'] = df['wnd_dir'].astype('category')
df["wnd_dir"] = df["wnd_dir"].cat.codes
df = df.astype('float32')
df.dtypes

pollution    float32
dew          float32
temp         float32
press        float32
wnd_dir      float32
wnd_spd      float32
snow         float32
rain         float32
dtype: object

In [3]:
df.head()

Unnamed: 0,pollution,dew,temp,press,wnd_dir,wnd_spd,snow,rain
0,129.0,-16.0,-4.0,1020.0,2.0,1.79,0.0,0.0
1,148.0,-15.0,-4.0,1020.0,2.0,2.68,0.0,0.0
2,159.0,-11.0,-5.0,1021.0,2.0,3.57,0.0,0.0
3,181.0,-7.0,-5.0,1022.0,2.0,5.36,1.0,0.0
4,138.0,-7.0,-5.0,1022.0,2.0,6.25,2.0,0.0


In [4]:
# df_X = df.loc[:, df.columns != 'AVGYLD']
# df_y = df['AVGYLD']
df_X = df.loc[:, df.columns != 'pollution']
df_y = df['pollution']

In [5]:
df_X_train, df_X_test, df_y_train, df_y_test = train_test_split(df_X, df_y, test_size=.2, random_state=0)

# LinearRegression

In [6]:
from sklearn.linear_model import LinearRegression
warnings.filterwarnings('ignore')

run_gs = False

if run_gs:
    
    parameter_grid = {
         'copy_X': [True, False],
         'normalize': [True, False]
     }

    forest = LinearRegression()

    grid_search = GridSearchCV(forest,
                               scoring='mean_squared_error',
                               param_grid=parameter_grid,
                               cv=5)

    LogitModel = grid_search.fit(df_X_train, df_y_train)
    parameters = grid_search.best_params_

    print('Best score: {}'.format(grid_search.best_score_))
    print('Tuned Logistic Regression Parameters: {}'.format(grid_search.best_params_))

In [7]:
score_measure_list = ['neg_mean_absolute_error', 'neg_mean_squared_error']

lr_parameters = {'copy_X': True, 'normalize': True}
lr_Model = LinearRegression(**lr_parameters)

lr_Model.fit(df_X_train, df_y_train)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=True)

In [8]:
for score_measure in score_measure_list:
    res = compute_score(lr_Model, df_X, df_y, scoring = score_measure)
    res = round(res, 2) * -1
    if score_measure == 'neg_mean_absolute_error':
        print('Model: lr, measure: mae, score: {}'.format(res))
    elif score_measure == 'neg_mean_squared_error':
        print('Model: lr measure: mse, score: {}'.format(res))
        tmp = round(res ** 0.5, 2)
        print('Model: lr measure: rmse, score: {}'.format(tmp))

Model: lr, measure: mae, score: 58.99
Model: lr measure: mse, score: 6679.85
Model: lr measure: rmse, score: 81.73


# RandomForest

In [9]:
warnings.filterwarnings('ignore')

dt_tune_run = False

if dt_tune_run:
    parameter_grid = {
#          'max_features':range(0,1,1),
        'random_state':range(0,10,1),
        'min_samples_leaf':range(1,10,1),
        'min_samples_split': range(2, 20, 1)
    }
    forest = RandomForestRegressor(criterion='mse', max_depth=None,
                                    max_leaf_nodes=None,
                                    min_impurity_split=0.005, min_weight_fraction_leaf=0.0)

    grid_search = GridSearchCV(forest,
                               scoring='mean_squared_error',
                               param_grid=parameter_grid,
                               cv=5)

    grid_search.fit(df_X_train, df_y_train)
    DecisionTreeModel = grid_search
    parameters = grid_search.best_params_

    print('Best score: {}'.format(grid_search.best_score_))
    print('Best parameters: {}'.format(grid_search.best_params_))

In [10]:
rfr_parameters = {'min_samples_leaf': 4, 'min_samples_split': 19, 'random_state': 0}
rfr_Model = RandomForestRegressor(**rfr_parameters)
rfr_Model.fit(df_X_train, df_y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=4, min_samples_split=19,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=0, verbose=0, warm_start=False)

In [11]:
for score_measure in score_measure_list:
    res = compute_score(rfr_Model, df_X, df_y, scoring = score_measure)
    res = round(res, 2) * -1
    if score_measure == 'neg_mean_absolute_error':
        print('Model: rfr, measure: mae, score: {}'.format(res))
    elif score_measure == 'neg_mean_squared_error':
        print('Model: rfr measure: mse, score: {}'.format(res))
        tmp = round(res ** 0.5, 2)
        print('Model: rfr measure: rmse, score: {}'.format(tmp))

Model: rfr, measure: mae, score: 55.21
Model: rfr measure: mse, score: 6448.74
Model: rfr measure: rmse, score: 80.3


# decision tree

In [12]:
warnings.filterwarnings('ignore')

dt_tune_run = False

if dt_tune_run:
    parameter_grid = {
#          'max_features':range(0,1,1),
        'random_state':range(0,10,1),
        'min_samples_leaf':range(1,10,1),
        'min_samples_split': range(2, 20, 1)
    }
    forest = DecisionTreeRegressor(criterion='mse', max_depth=None,
                                    max_leaf_nodes=None,
                                    min_impurity_split=0.005, min_weight_fraction_leaf=0.0,
                                    presort=False, splitter='random')

    grid_search = GridSearchCV(forest,
                               scoring='mean_squared_error',
                               param_grid=parameter_grid,
                               cv=5)

    grid_search.fit(df_X_train, df_y_train)
    DecisionTreeModel = grid_search
    parameters = grid_search.best_params_

    print('Best score: {}'.format(grid_search.best_score_))
    print('Best parameters: {}'.format(grid_search.best_params_))

In [13]:
dt_parameters = {'min_samples_leaf': 8, 'min_samples_split': 17, 'random_state': 9}
dt_Model = RandomForestRegressor(**dt_parameters)
dt_Model.fit(df_X_train, df_y_train)

RandomForestRegressor(bootstrap=True, criterion='mse', max_depth=None,
           max_features='auto', max_leaf_nodes=None,
           min_impurity_decrease=0.0, min_impurity_split=None,
           min_samples_leaf=8, min_samples_split=17,
           min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=1,
           oob_score=False, random_state=9, verbose=0, warm_start=False)

In [14]:
for score_measure in score_measure_list:
    res = compute_score(dt_Model, df_X, df_y, scoring = score_measure)
    res = round(res, 2) * -1
    if score_measure == 'neg_mean_absolute_error':
        print('Model: dt, measure: mae, score: {}'.format(res))
    elif score_measure == 'neg_mean_squared_error':
        print('Model: dt measure: mse, score: {}'.format(res))
        tmp = round(res ** 0.5, 2)
        print('Model: dt measure: rmse, score: {}'.format(tmp))

Model: dt, measure: mae, score: 54.73
Model: dt measure: mse, score: 6373.44
Model: dt measure: rmse, score: 79.83


# SVM

In [15]:
warnings.filterwarnings('ignore')

svm_tune_run = False

if svm_tune_run:
    parameter_grid = {
         'C': [0.001, 0.01, 0.1, 1, 10, 100, 1000],
         'gamma' : [0.001, 0.01, 0.1, 1]
     }

    forest = SVR(cache_size=200, coef0=0.0,
              degree=3, kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

    grid_search = GridSearchCV(forest,
                               scoring='mean_squared_error',
                               param_grid=parameter_grid,
                               cv=5)

    grid_search.fit(df_X_train, df_y_train)
    SVCModel = grid_search
    parameters = grid_search.best_params_

    print('Best score: {}'.format(grid_search.best_score_))
    print('Best parameters: {}'.format(grid_search.best_params_))

In [16]:
svm_parameters = {'C': 1000, 'gamma': 0.01}
svm_Model = SVR(**svm_parameters)
svm_Model.fit(df_X_train, df_y_train)

SVR(C=1000, cache_size=200, coef0=0.0, degree=3, epsilon=0.1, gamma=0.01,
  kernel='rbf', max_iter=-1, shrinking=True, tol=0.001, verbose=False)

In [17]:
for score_measure in score_measure_list:
    res = compute_score(svm_Model, df_X, df_y, scoring = score_measure)
    res = round(res, 2) * -1
    if score_measure == 'neg_mean_absolute_error':
        print('Model: dt, measure: mae, score: {}'.format(res))
    elif score_measure == 'neg_mean_squared_error':
        print('Model: dt measure: mse, score: {}'.format(res))
        tmp = round(res ** 0.5, 2)
        print('Model: dt measure: rmse, score: {}'.format(tmp))

Model: dt, measure: mae, score: 54.68
Model: dt measure: mse, score: 6580.01
Model: dt measure: rmse, score: 81.12


# xgboot

In [18]:
from xgboost import XGBRegressor
warnings.filterwarnings('ignore')

dt_tune_run = False

if dt_tune_run:
    parameter_grid = {
                'min_child_weight':[4,5], 'gamma':[i/10.0 for i in range(3,6)], 
                'subsample':[i/10.0 for i in range(6,11)],
                'colsample_bytree':[i/10.0 for i in range(6,11)], 'max_depth': [2,3,4]
    }
    forest = XGBRegressor(nthread=-1)

    grid_search = GridSearchCV(forest,
                               scoring='mean_squared_error',
                               param_grid=parameter_grid,
                               cv=5)

    grid_search.fit(df_X_train, df_y_train)
    DecisionTreeModel = grid_search
    parameters = grid_search.best_params_

    print('Best score: {}'.format(grid_search.best_score_))
    print('Best parameters: {}'.format(grid_search.best_params_))

In [19]:
xgb_parameters = {'colsample_bytree': 1.0, 'gamma': 0.3, 'max_depth': 4, 'min_child_weight': 4, 'subsample': 0.6}
xgb_Model = XGBRegressor(**xgb_parameters)
xgb_Model.fit(df_X_train, df_y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=1.0, gamma=0.3, learning_rate=0.1,
       max_delta_step=0, max_depth=4, min_child_weight=4, missing=None,
       n_estimators=100, n_jobs=1, nthread=None, objective='reg:linear',
       random_state=0, reg_alpha=0, reg_lambda=1, scale_pos_weight=1,
       seed=None, silent=True, subsample=0.6)

In [20]:
for score_measure in score_measure_list:
    res = compute_score(xgb_Model, df_X, df_y, scoring = score_measure)
    res = round(res, 2) * -1
    if score_measure == 'neg_mean_absolute_error':
        print('Model: dt, measure: mae, score: {}'.format(res))
    elif score_measure == 'neg_mean_squared_error':
        print('Model: dt measure: mse, score: {}'.format(res))
        tmp = round(res ** 0.5, 2)
        print('Model: dt measure: rmse, score: {}'.format(tmp))

Model: dt, measure: mae, score: 52.99
Model: dt measure: mse, score: 5867.25
Model: dt measure: rmse, score: 76.6
