In [1]:
import pandas as pd
import numpy as np
import xgboost as xgb

In [2]:
data = pd.read_csv("forest_addFire_thresh0.csv")

In [3]:
data.head()
data=data.replace(-9999,np.nan)

In [4]:
Me2=data.loc[data["Site"]=='Me2']

In [5]:
Blo=data.loc[data["Site"]=='Blo']

In [6]:
from collections import Counter
def detect_outliers(df,n,features):
    outlier_indices = []
    # iterate over features(columns)
    for col in features:
        # 1st quartile (25%)
        Q1 = np.percentile(df[col], 25)
        # 3rd quartile (75%)
        Q3 = np.percentile(df[col],75)
        # Interquartile range (IQR)
        IQR = Q3 - Q1
        # outlier step
        outlier_step = 1.5 * IQR
        # Determine a list of indices of outliers for feature col
        outlier_list_col = df[(df[col] < Q1 - outlier_step) | (df[col] > Q3 + outlier_step )].index
        # append the found outlier indices for col to the list of outlier indices 
        outlier_indices.extend(outlier_list_col)
    # select observations containing more than 2 outliers
    outlier_indices = Counter(outlier_indices)        
    multiple_outliers = list( k for k, v in outlier_indices.items() if v > n )
    return multiple_outliers   

# detect outliers from Age, SibSp , Parch and Fare


In [7]:
Outliers_to_drop = detect_outliers(Me2,2,["Net Radiation","Latent Heat","Sensible Heat","Soil Heat Flux","Air Temp",'Soil Temp','CO2 Flux'])
Me2 = Me2.drop(Outliers_to_drop, axis = 0).reset_index(drop=True)

  interpolation=interpolation)


In [8]:
Outliers_to_drop = detect_outliers(Blo,2,["Net Radiation","Latent Heat","Sensible Heat","Soil Heat Flux","Air Temp",'Soil Temp','CO2 Flux'])
Blo = Blo.drop(Outliers_to_drop, axis = 0).reset_index(drop=True)

  interpolation=interpolation)


In [9]:
X_train=Me2[["Net Radiation","Latent Heat","Sensible Heat","Soil Heat Flux","Air Temp",'Soil Temp','Fire']]

In [10]:
y_train=Me2["CO2 Flux"]

In [11]:
def MaxMinNormalization(x,Max,Min):  
    x = (x - Min) / (Max - Min);  
    return x;  
X_train=MaxMinNormalization(X_train,np.max(X_train),np.min(X_train))
y_train=MaxMinNormalization(y_train,np.max(y_train),np.min(y_train))

In [12]:
bst = xgb.XGBRegressor(
 learning_rate =0.01,
 n_estimators=5000,
 max_depth=4,
 min_child_weight=6,
 gamma=0,
 subsample=0.8,
 colsample_bytree=0.8,
 reg_alpha=0.005,
 objective= 'binary:logistic',
 nthread=4,
 scale_pos_weight=1,seed=1)
bst.fit(X_train, y_train)

XGBRegressor(base_score=0.5, booster='gbtree', colsample_bylevel=1,
       colsample_bytree=0.8, gamma=0, learning_rate=0.01, max_delta_step=0,
       max_depth=4, min_child_weight=6, missing=None, n_estimators=5000,
       n_jobs=1, nthread=4, objective='binary:logistic', random_state=0,
       reg_alpha=0.005, reg_lambda=1, scale_pos_weight=1, seed=1,
       silent=True, subsample=0.8)

In [13]:
bst.score(X_train, y_train, sample_weight=None)

0.95011537999904461

In [14]:
X_test=Blo[["Net Radiation","Latent Heat","Sensible Heat","Soil Heat Flux","Air Temp",'Soil Temp','Fire']]
y_test=Blo["CO2 Flux"]

In [15]:
X_test=MaxMinNormalization(X_test,np.max(X_test),np.min(X_test))
y_test=MaxMinNormalization(y_test,np.max(y_test),np.min(y_test))

In [16]:
bst.score(X_test,y_test)

-0.4098408764369641

In [37]:
from sklearn.metrics import r2_score

In [38]:
predictions = bst.predict(X_test)

In [39]:
predictions

array([ 0.52219367,  0.52219367,  0.49768004,  0.52219367,  0.52219367,
        0.49768004,  0.52219367,  0.52219367,  0.50643849,  0.54682535,
        0.54887676,  0.52290541,  0.54682535,  0.54887676,  0.52290541,
        0.54682535,  0.54887676,  0.52290541,  0.54682535,  0.54887676,
        0.5873065 ,  0.85060853,  0.81830984,  0.45992979,  0.45971403,
        0.48350373,  0.40923145,  0.40782565,  0.53693026,  0.55944866,
        0.54512751,  0.5262419 ,  0.51793891,  0.49733716,  0.4733564 ,
        0.75414604,  0.54324663,  0.53849053,  0.54053563,  0.53934896,
        0.53849053,  0.54053563,  0.53934896,  0.53849053,  0.54053563,
        0.53934896,  0.53849053,  0.54053563,  0.54316038,  0.53099209,
        0.53273648,  0.53162235,  0.81330734,  0.83058774,  0.83155173,
        0.8317858 ,  0.83144426,  0.83225387,  0.83263874,  0.83102912,
        0.83225387,  0.83263874,  0.83102912,  0.83225387,  0.83263874,
        0.83102912,  0.83225387,  0.83263874,  0.83169895,  0.75

In [40]:
y_test

0      0.543405
1      0.543311
2      0.540466
3      0.543405
4      0.543311
5      0.540466
6      0.543405
7      0.543311
8      0.540466
9      0.543405
10     0.543311
11     0.540466
12     0.543405
13     0.543311
14     0.540466
15     0.543405
16     0.543311
17     0.540466
18     0.543405
19     0.543311
20     0.540466
21     0.587516
22     0.620899
23     0.492230
24     0.422511
25     0.419219
26     0.359184
27     0.297242
28     0.308021
29     0.352183
         ...   
542    0.286929
543    0.273436
544    0.218747
545    0.235251
546    0.229493
547    0.226428
548    0.353268
549    0.389004
550    0.330538
551    0.317785
552    0.329552
553    0.413134
554    0.462206
555    0.428027
556    0.295007
557    0.407949
558    0.416735
559    0.435070
560    0.490892
561    0.497408
562    0.506278
563    0.501732
564    0.497408
565    0.506278
566    0.501732
567    0.497408
568    0.506278
569    0.501732
570    0.497408
571    0.505654
Name: CO2 Flux, Length: 

In [41]:
r2_score(predictions, y_test)

0.10732157432677281