In [None]:
# Name of the column used as the prediction target throughout the notebook
target = 'area'

# Define the metrics

**RMSE** 

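RMSE (root mean squared error) is one of the most widely used evaluation metrics for regression problems: $\mathrm{RMSE} = \sqrt{\frac{1}{n}\sum_{i=1}^{n}(y_i - \hat{y}_i)^2}$. It is most meaningful when the errors are unbiased and approximately normally distributed.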

# Dependencies

In [None]:
#Import required libraries

import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
plt.style.use('ggplot')

import statsmodels.api as sm
from scipy.stats import zscore
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.linear_model import LinearRegression

---

# Load and describe data

In [None]:
# path = location of the CSV file on this machine.
# Placeholder: replace the blank string with the real path before running this cell.
path = " "
df = pd.read_csv(path)

# Display the (rows, columns) shape of the loaded frame
df.shape

In [None]:
# Get an overview of the data types (e.g. using df.info())
#Type code here

In [None]:
# Get overview of data using describe
#Type code here

---

# Missing value treatment

In [None]:
# Get missing values in dataset
#Type code here

---

# Exploratory Data Analysis
   We will try out the following analysis on our dataset
   - Univariate 
   - Bivariate 
   - Multivariate

In [None]:
# Default figure size (width, height) applied to the plots created below
plt.rcParams["figure.figsize"] = (9, 5)

## Univariate analysis



### Begin with the target variable, `area`

In [None]:
plt.figure(figsize=(16,5))
# Skewness and kurtosis summarise how asymmetric / heavy-tailed the target is
print("Skew: {}".format(df[target].skew()))
print("Kurtosis: {}".format(df[target].kurtosis()))
ax =                                                   #Type code here
# Tick marks every 50 units from 0 up to 1150 on the x-axis
plt.xticks([i for i in range(0,1200,50)])
plt.show()

In [None]:
ax =                                                   #Type code here

In [None]:
# Get outlier points
y_outliers =                                           #Type code here
y_outliers

### Independent columns 

In [None]:
# Separate categorical and numerical columns
dfa = df.drop(columns=target)
cat_columns = #Type Code here 
num_columns = #Type Code here

cat_columns,num_columns

### Categorical columns 

In [None]:
# Analyzing categorical columns.
# The grid has one column per categorical feature and two rows, so the layout
# follows len(cat_columns) instead of assuming exactly two features.
n_cat = len(cat_columns)
plt.figure(figsize=(16,10))
for i,col in enumerate(cat_columns,1):
    # First row of the grid: slot i
    plt.subplot(2,n_cat,i)
    # Using seaborn plot a countplot for columns
    # Type Code here

    # Second row of the grid: the slot directly below the one used above
    plt.subplot(2,n_cat,i+n_cat)
    # Use bar plot to show a plot of value counts
    #Type Code here

    # Axis labels apply to the second-row plot, which is the active subplot here
    plt.ylabel(col)
    plt.xlabel('% distribution per category')
plt.tight_layout()
plt.show()

### Numerical Columns

In [None]:
# Lay the plots out on a 4-column grid: the density plots fill the first
# len(num_columns) slots and the matching box plots follow directly after them.
n_num = len(num_columns)
n_grid_cols = 4
# Keep the original 8 rows; grow only when 2 * n_num slots would not fit
n_grid_rows = max(8, (2 * n_num + n_grid_cols - 1) // n_grid_cols)

plt.figure(figsize=(18,40))
for i,col in enumerate(num_columns,1):
    plt.subplot(n_grid_rows,n_grid_cols,i)
    # Use seaborn to plot a kernel density plot for columns
    # Type code here
    plt.subplot(n_grid_rows,n_grid_cols,i+n_num)
    # Use box-plot to show a plot of selected columns
    # Type code here
plt.tight_layout()
plt.show()

# Skewness and kurtosis of every numerical column, displayed as a two-row table
num_data = df[num_columns]
pd.DataFrame(data=[num_data.skew(),num_data.kurtosis()],index=['skewness','kurtosis'])

## Bivariate analysis with our target variable

In [None]:
# Summary statistics of the target column, followed by the outlier points collected earlier.
# Uses the shared `target` name rather than repeating the column literal.
print(df[target].describe(),'\n')
print(y_outliers)

In [None]:
# a categorical variable based on forest fire area damage
# No damage, low, moderate, high, very high

def area_cat(area): # Define a function that returns above categories for the range of values
                    #  0 , <=1, <=25, <=100, >100 respectively
 # i.e. area == 0 -> no damage, <= 1 -> low, <= 25 -> moderate,
 #      <= 100 -> high, > 100 -> very high
 #Type code here






#Apply function to variable 'area' and store the result in a new 'damage_category' column (later cells rely on it)
#Type code here
df.head()


### Categorical columns

In [None]:
cat_columns

In [None]:
for col in cat_columns:
    # Cross-tabulate damage category against `col`; normalize='index' scales each row to sum to 1
    cross = pd.crosstab(index=df['damage_category'],columns=df[col],normalize='index')
    # plot an image to visualize the different damage categories
    # Type code here
    plt.xlabel('% distribution per category')
    plt.xticks(np.arange(0,1.1,0.1))
    plt.title("Forestfire damage each {}".format(col))
plt.show()

### Numerical columns

In [None]:
plt.figure(figsize=(20,40))
for i,col in enumerate(num_columns,1):
    # One full-width row per numerical column
    # NOTE(review): the 10-row grid is hard-coded, so this assumes at most 10 numerical columns
    plt.subplot(10,1,i)
    if col in ['X','Y']:
        sns.swarmplot(data=df,x=col,y=target,hue='damage_category')
    else:
        # similar to the swarmplot above, use a scatterplot
        # Type code here
plt.show()

## Multivariate analysis

In [None]:
selected_features =                                      # Drop the columns 'damage_category','day','month'
selected_features

In [None]:
# Pairwise plots of the selected features, coloured by damage category
sns.pairplot(data=df, vars=selected_features, hue='damage_category')
plt.show()

# Outlier treatment

Observed outliers are in the following columns:
1. area 
2. FFMC
3. ISI
4. rain

In [None]:
out_columns =                          # Define array containing names of columns to be treated during data modelling

# Preparing the data for modelling

- Encoding the categorical columns 

In [None]:
df =                                                   #use pandas library to get dummies for day and month variable

- Data transformations such as `log`, `root`, `inverse`, `exponential`, etc.

In [None]:
# Raw summary of the columns flagged for outlier treatment
outlier_view = df[out_columns]
print(outlier_view.describe())
# Skewness and kurtosis after a log(1 + x) transform, shown as the cell output
log_view = np.log1p(outlier_view)
log_view.skew(), log_view.kurtosis()

In [None]:
# FFMC and rain have high skew and kurtosis values,
# and a linear regression fit is sensitive to such heavy-tailed features.
# For FFMC - flag the rows whose absolute z-score is below 3 so the outliers can be removed
mask = df.loc[:,['FFMC']].apply(zscore).abs() < 3

# Most of the values in rain are 0.0, so convert it to a categorical column
df['rain'] = df['rain'].apply(lambda x: ) #Finish the code

# Keep only the rows flagged in `mask` (this drops the FFMC outliers)
df = df[mask.values]
df.shape

In [None]:
# 'rain' is excluded from the columns that still need a transformation.
# Rebuilding the list (instead of calling .remove() in place) keeps this cell
# safe to re-run: a second run no longer raises ValueError.
out_columns = [col for col in out_columns if col != 'rain']
# Use log transformation on remaining columns to be treated
# Type code here

In [None]:
df[out_columns].skew()

In [None]:
# Modelling frame: an independent copy of df without the 'damage_category' column.
# This dataframe will be used for building ML model
df_ml = df.drop('damage_category', axis='columns').copy()

---

# Linear Regression

In [None]:
# Feature matrix and target vector for the linear regression.
# Built from df_ml (the modelling frame, which already has 'damage_category'
# removed) and the shared `target` name instead of repeating column literals.
X = df_ml.drop(columns=[target])
y = df_ml[target]

In [None]:
# Define and fit linear regression model
lr =               # Type code here


# Report the fitted intercept, the R^2 score on (X, y), and one coefficient per feature column
print(f'Intercept: {lr.intercept_}')
print(f'R^2 score: {lr.score(X, y)}')
pd.DataFrame({"Coefficients": lr.coef_}, index=X.columns)

# Improving Stats model

**Dropping columns to improve accuracy:**
    
By checking high Variance inflation factor and p-value we will decide whether to keep the column or drop it.

> $R^2 = 1 - \frac{SSE}{SST}$, where SSE is the sum of squared residuals and SST is the total sum of squares.

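Just by dropping the constant we got a huge bump in adjusted R2 from `2.5%` to `40.6%`. Note that when the model has no constant, statsmodels reports the *uncentered* R2 (SST is computed without subtracting the mean of `y`), so the two figures are not directly comparable.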

In [None]:
# Fresh feature matrix and target vector for the statsmodels experiments.
# Built from df_ml (the modelling frame, which already has 'damage_category'
# removed) and the shared `target` name instead of repeating column literals.
X = df_ml.drop(columns=[target])
y = df_ml[target]

In [None]:
def check_stats(X,y):
    """Print multicollinearity and regression diagnostics for a feature set.

    Prints the ten largest variance inflation factors (one per column of X,
    sorted in descending order), then the statsmodels OLS summary of y
    regressed on X. X is passed to OLS as given; no constant column is
    added here.

    Parameters
    ----------
    X : pandas.DataFrame
        Feature columns.
    y : array-like
        Response values.

    Returns
    -------
    None
    """
    design = X.values
    vif_by_column = pd.Series(
        [variance_inflation_factor(design, idx) for idx in range(X.shape[1])],
        index=X.columns,
        name='vif',
    )
    vif_table = vif_by_column.to_frame().sort_values(by="vif", ascending=False)
    print(vif_table.head(10))

    ols_fit = sm.OLS(y, X).fit()
    print(ols_fit.summary())
check_stats(X,y)

In [None]:
# Drop FFMC from the feature set.
# Rebinding X (instead of inplace=True) together with errors='ignore' keeps
# this cell safe to re-run: a second run no longer raises KeyError.
X = X.drop(columns=['FFMC'], errors='ignore')
# check_stats(X,y)

In [None]:
# Similarly remove columns - Y, month_jul, day_thu, day_mon, month_aug (based on R2 score)