In [1]:
#Import packages 
import matplotlib.pyplot as plt
import matplotlib as mpl
import pandas as pd
import seaborn as sns
import sklearn
import numpy as np
from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score
from sklearn import preprocessing
%matplotlib inline

In [2]:
#Import dataset forestfires.csv. 
df = pd.read_csv('forestfires.csv')

#Shows the first 5 data points in the dataset
df.head()

## As we can see, we have a problem with "month" and "day" being categorical variables however, we need to clean this up and transform them into numerical variables. 

Unnamed: 0,X,Y,month,day,FFMC,DMC,DC,ISI,temp,RH,wind,rain,area
0,7,5,mar,fri,86.2,26.2,94.3,5.1,8.2,51,6.7,0.0,0.0
1,7,4,oct,tue,90.6,35.4,669.1,6.7,18.0,33,0.9,0.0,0.0
2,7,4,oct,sat,90.6,43.7,686.9,6.7,14.6,33,1.3,0.0,0.0
3,8,6,mar,fri,91.7,33.3,77.5,9.0,8.3,97,4.0,0.2,0.0
4,8,6,mar,sun,89.3,51.3,102.2,9.6,11.4,99,1.8,0.0,0.0


In [3]:
#Create a set of dummy variables from the month and day variable
df = pd.get_dummies(df)

In [4]:
#Check columns of the data
df.columns

Index(['X', 'Y', 'FFMC', 'DMC', 'DC', 'ISI', 'temp', 'RH', 'wind', 'rain',
       'area', 'month_apr', 'month_aug', 'month_dec', 'month_feb', 'month_jan',
       'month_jul', 'month_jun', 'month_mar', 'month_may', 'month_nov',
       'month_oct', 'month_sep', 'day_fri', 'day_mon', 'day_sat', 'day_sun',
       'day_thu', 'day_tue', 'day_wed'],
      dtype='object')

In [5]:
#Drop one column of month and day, in order to avoid multicollinearity
df.drop(labels=['month_nov', 'day_sat'], axis=1, inplace=True)

In [6]:
#Check the range of values of each column
df.max()-df.min()

X               8.00
Y               7.00
FFMC           77.50
DMC           290.20
DC            852.70
ISI            56.10
temp           31.10
RH             85.00
wind            9.00
rain            6.40
area         1090.84
month_apr       1.00
month_aug       1.00
month_dec       1.00
month_feb       1.00
month_jan       1.00
month_jul       1.00
month_jun       1.00
month_mar       1.00
month_may       1.00
month_oct       1.00
month_sep       1.00
day_fri         1.00
day_mon         1.00
day_sun         1.00
day_thu         1.00
day_tue         1.00
day_wed         1.00
dtype: float64

In [7]:
#:: Linear Regression ::
#Import LinearRegression package
from sklearn.linear_model import LinearRegression

#Copy df into data and establish the target as 'area'
data = df.copy()
target = data.pop('area')

#LR = data and target
lr = LinearRegression(fit_intercept=True)
lr.fit(data, target)

LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [8]:
#Import MSE package 
#Look at R^2 and Root Mean Squared Error (RMSE)
from sklearn.metrics import mean_squared_error
print(lr.score(data, target))
predictions = lr.predict(data)
mse = mean_squared_error(target, predictions)
rmse = np.sqrt(mse)
print(rmse)


## R^2 = 0.04578 is very small; RMSE = 62.12 is large

0.04578209650808518
62.12143311792724


In [9]:
#Import package
import statsmodels.formula.api as smf
#Set up statistics; define the data attributes
df_attributes = df.columns.values.tolist()
number_of_columns = len(df_attributes)

#Create models to test each column
statistics = list()
for idx in range(0, number_of_columns - 1):
    model = smf.ols(formula = "area ~ " + 
                    df_attributes[idx], data = df).fit()
    
    title = 'Model: area ~ ' + df_attributes[idx]
#Print the model which allows us to see the relationship between X=all columns except for area and y=area. The relationship in terms of r-squared, p-value, log-likelihood.
    print()
    print(model.summary())
    print()
    statistics.append([model.f_pvalue, model.rsquared])


                            OLS Regression Results                            
Dep. Variable:                   area   R-squared:                       0.004
Model:                            OLS   Adj. R-squared:                  0.002
Method:                 Least Squares   F-statistic:                     2.077
Date:                Sat, 28 Apr 2018   Prob (F-statistic):              0.150
Time:                        21:36:14   Log-Likelihood:                -2879.4
No. Observations:                 517   AIC:                             5763.
Df Residuals:                     515   BIC:                             5771.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept      4.7049      6.304      0.746      0.


                            OLS Regression Results                            
Dep. Variable:                   area   R-squared:                       0.000
Model:                            OLS   Adj. R-squared:                 -0.002
Method:                 Least Squares   F-statistic:                  0.009029
Date:                Sat, 28 Apr 2018   Prob (F-statistic):              0.924
Time:                        21:36:14   Log-Likelihood:                -2880.4
No. Observations:                 517   AIC:                             5765.
Df Residuals:                     515   BIC:                             5773.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     13.0452      3.492      3.736      0.

                            OLS Regression Results                            
Dep. Variable:                   area   R-squared:                       0.003
Model:                            OLS   Adj. R-squared:                  0.001
Method:                 Least Squares   F-statistic:                     1.654
Date:                Sat, 28 Apr 2018   Prob (F-statistic):              0.199
Time:                        21:36:14   Log-Likelihood:                -2879.6
No. Observations:                 517   AIC:                             5763.
Df Residuals:                     515   BIC:                             5772.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     10.3070      3.425      3.009      0.0

In [10]:
#Summarize the model - the p-value and R-squared
statistics = pd.DataFrame(statistics, 
                              index=df_attributes[: number_of_columns - 1], 
                              columns=['p-value', 'R-squared'])
statistics.T

Unnamed: 0,X,Y,FFMC,DMC,DC,ISI,temp,RH,wind,rain,...,month_jun,month_mar,month_may,month_oct,month_sep,day_fri,day_mon,day_sun,day_thu,day_tue
p-value,0.150096,0.30851,0.362592,0.097335,0.262363,0.851418,0.026101,0.086271,0.779939,0.86731,...,0.644924,0.300773,0.887006,0.701822,0.199054,0.229755,0.630475,0.642497,0.648064,0.975874
R-squared,0.004018,0.002014,0.00161,0.005328,0.002439,6.8e-05,0.009573,0.005703,0.000152,5.4e-05,...,0.000413,0.002079,3.9e-05,0.000285,0.0032,0.0028,0.00045,0.000419,0.000405,2e-06


In [11]:
#Find statistically significant column
statistics[statistics['p-value'] < 0.05]

##temp is the only statistically significant column (p-value = 0.026) accounting for (1% of forest fires)

Unnamed: 0,p-value,R-squared
temp,0.026101,0.009573
area,0.0,1.0


In [12]:
#Linear regression relationship between area and temp
print((smf.ols(formula = "area ~ temp", data = df).fit()).summary())

                            OLS Regression Results                            
Dep. Variable:                   area   R-squared:                       0.010
Model:                            OLS   Adj. R-squared:                  0.008
Method:                 Least Squares   F-statistic:                     4.978
Date:                Sat, 28 Apr 2018   Prob (F-statistic):             0.0261
Time:                        21:36:15   Log-Likelihood:                -2878.0
No. Observations:                 517   AIC:                             5760.
Df Residuals:                     515   BIC:                             5768.
Df Model:                           1                                         
Covariance Type:            nonrobust                                         
                 coef    std err          t      P>|t|      [0.025      0.975]
------------------------------------------------------------------------------
Intercept     -7.4138      9.500     -0.780      0.4

In [13]:
## :: Train/Test Split ::
#Import LogisiticRegression and Train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split

#Create training and testing variabels
X_train, X_test, y_train, y_test = train_test_split(data, target, shuffle=True, test_size=0.5, random_state=0)

In [14]:
#Print variables
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)

(258, 27)
(259, 27)
(258,)
(259,)


In [15]:
## :: Cross Validation ::
#Import KFold
from sklearn.model_selection import KFold 
#Create array(2)
X = np.array([[1,2], [3,4], [1,2], [3,4]])
y = np.array([1,2,3,4])
#Define split
kf = KFold(n_splits=2)
kf.get_n_splits(X)
print(kf)
KFold(n_splits=2, random_state=None, shuffle=False)


KFold(n_splits=2, random_state=None, shuffle=False)


KFold(n_splits=2, random_state=None, shuffle=False)

In [16]:
#This function will split the data(df)
for train_index, test_index in kf.split(X):
    print("Train:", train_index, "Test:", test_index)
    X_train, X_test = X[train_index], X[test_index]
    y_train, y_test = y[train_index], y[test_index]

Train: [2 3] Test: [0 1]
Train: [0 1] Test: [2 3]


In [17]:
df['area'].unique()[0:5]

array([0.  , 0.36, 0.43, 0.47, 0.55])

In [18]:
#Import DecisionTreeRegressor and StandardScaler
from sklearn.tree import DecisionTreeRegressor
from sklearn.preprocessing import StandardScaler
scaler = StandardScaler()

In [19]:
#Criterior for the decision tree regressor is Mean Absolute Error(MAE)
dt = DecisionTreeRegressor(criterion='mae')
#Build a decision tree regressor from the training and testing set
dt.fit(scaler.fit_transform(X_train, y_test),
               scaler.fit_transform(X_test, y_train))
x=dt.predict(X_test)
print(np.sqrt(np.mean((y_test-x))))

1.5811388300841898




In [20]:
## :: Logisitic Regression ::
#Check if dataset size is sufficient (50 records per feature=good)
forestfires_dmy = pd.concat([df], axis=1)
forestfires_dmy.head()
forestfires_dmy.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 517 entries, 0 to 516
Data columns (total 28 columns):
X            517 non-null int64
Y            517 non-null int64
FFMC         517 non-null float64
DMC          517 non-null float64
DC           517 non-null float64
ISI          517 non-null float64
temp         517 non-null float64
RH           517 non-null int64
wind         517 non-null float64
rain         517 non-null float64
area         517 non-null float64
month_apr    517 non-null uint8
month_aug    517 non-null uint8
month_dec    517 non-null uint8
month_feb    517 non-null uint8
month_jan    517 non-null uint8
month_jul    517 non-null uint8
month_jun    517 non-null uint8
month_mar    517 non-null uint8
month_may    517 non-null uint8
month_oct    517 non-null uint8
month_sep    517 non-null uint8
day_fri      517 non-null uint8
day_mon      517 non-null uint8
day_sun      517 non-null uint8
day_thu      517 non-null uint8
day_tue      517 non-null uint8
day_wed      51

In [21]:
X = df.ix[:,(1,2,3,4,5,6,)].values
y = df.ix[:,0].values

.ix is deprecated. Please use
.loc for label based indexing or
.iloc for positional indexing

See the documentation here:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#ix-indexer-is-deprecated
  """Entry point for launching an IPython kernel.


In [22]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.5, random_state=0)

In [23]:
#Create the model
logreg = LogisticRegression()
logreg.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=None, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [24]:
y_pred = logreg.predict(X_test)

In [25]:
from sklearn.metrics import confusion_matrix 
confusion_matrix = confusion_matrix(y_test, y_pred)
confusion_matrix [0:5]


array([[ 6,  4,  0,  4,  0,  0,  4,  2,  0],
       [ 9,  6,  0, 16,  0,  0,  3,  3,  1],
       [ 1,  3,  0, 23,  0,  0,  4,  4,  1],
       [ 8,  4,  0, 19,  0,  0,  2,  2,  0],
       [ 0,  3,  0, 11,  0,  0,  1,  2,  0]])

In [26]:
from sklearn.metrics import classification_report
print(classification_report(y_test, y_pred))

             precision    recall  f1-score   support

          1       0.19      0.30      0.24        20
          2       0.29      0.16      0.20        38
          3       0.00      0.00      0.00        36
          4       0.14      0.54      0.22        35
          5       0.00      0.00      0.00        17
          6       1.00      0.02      0.04        46
          7       0.17      0.10      0.12        30
          8       0.48      0.80      0.60        30
          9       0.00      0.00      0.00         7

avg / total       0.33      0.23      0.17       259



  'precision', 'predicted', average, warn_for)
