# Import necessary libraries

In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
import seaborn as sns
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn import metrics
import statsmodels.formula.api as smf

# UDFs (user-defined functions)

In [None]:
# UDF to get a summary of continuous variables (used below to inspect outliers)
def conti_summ(x):
    """Return a labelled statistical profile of the numeric Series ``x``.

    The result is a ``pd.Series`` whose entries are, in order: 'Count'
    (non-null count), 'Null' (number of nulls), 'Sum', 'Mean', 'Var', the
    quantiles 'Q1', 'Q5', 'Q10', 'Q25', 'Q50', 'Q75', 'Q90', 'Q95', 'Q99',
    and 'Max'.
    """
    # (label, quantile level) pairs, in the order they appear in the output
    quantile_levels = (
        ('Q1', 0.01), ('Q5', 0.05), ('Q10', 0.10),
        ('Q25', 0.25), ('Q50', 0.50), ('Q75', 0.75),
        ('Q90', 0.90), ('Q95', 0.95), ('Q99', 0.99),
    )

    labels = ['Count', 'Null', 'Sum', 'Mean', 'Var']
    values = [x.count(), x.isnull().sum(), x.sum(), x.mean(), x.var()]

    for label, level in quantile_levels:
        labels.append(label)
        values.append(x.quantile(level))

    labels.append('Max')
    values.append(x.max())

    return pd.Series(values, index=labels)

In [None]:
# UDF to create dummy variables
def dummy_vars(df,colname):
    """Replace categorical column(s) of ``df`` with dummy (indicator) columns.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame containing the column(s) to encode. It is not modified.
    colname : str or list of str
        Column label(s) to encode; also used as the prefix of the generated
        dummy column names.

    Returns
    -------
    pandas.DataFrame
        A new frame with ``colname`` removed and the dummy columns appended.
        The first level of each encoded column is dropped
        (``drop_first=True``).
    """
    dummies = pd.get_dummies(df[colname], prefix=colname, drop_first=True)
    # Drop on a new object rather than in place: the caller's frame stays
    # intact, and no SettingWithCopyWarning is raised when `df` is itself a
    # selection of another frame.
    remaining = df.drop(colname, axis=1)
    return pd.concat([remaining, dummies], axis=1)

# Import the dataset

In [None]:
# Load the fish-market CSV into a DataFrame (path is relative to the notebook)
fish = pd.read_csv(filepath_or_buffer='../input/fish-market/Fish.csv')

# Exploratory Data Analysis

In [None]:
fish.head()

In [None]:
fish.shape

In [None]:
fish.dtypes

In [None]:
fish['Species'].value_counts()

In [None]:
fish.isnull().sum()  # No missing values

In [None]:
# Bar chart of the number of records per species.
# The column is passed by keyword: recent seaborn releases no longer accept
# the data vector as a positional `x` argument.
sns.countplot(x='Species', data=fish)
plt.show()

In [None]:
fish_group = fish.groupby('Species').mean()

In [None]:
fish_group

# Data Preparation

### Separating categorical and continuous variables

In [None]:
fish.columns

In [None]:
fish_continuous = fish.loc[:,fish.dtypes=='float64']

In [None]:
fish_continuous.head()

In [None]:
fish_categorical = fish[['Species']]

In [None]:
fish_categorical.head()

### Outlier treatment and dummy variable creation

In [None]:
fish_continuous.apply(lambda x: conti_summ(x)).T

In [None]:
# cliping the outliers
fish_continuous = fish_continuous.apply(lambda x: x.clip(lower=x.quantile(0.05),upper=x.quantile(0.95)))

In [None]:
fish_continuous.apply(lambda x: conti_summ(x)).T

In [None]:
fish_categorical = dummy_vars(fish_categorical,['Species'])

In [None]:
fish_categorical.head()

# Final Data for the Analysis

In [None]:
fish_final = pd.concat([fish_continuous,fish_categorical],axis=1)

In [None]:
fish_final.head()

In [None]:
fish_final.shape

In [None]:
fish_final.isnull().sum()

In [None]:
plt.figure(figsize=(10,8))
sns.heatmap(fish_final.corr(),annot=True,fmt='.2f')
plt.show()

# Checking Assumptions of Linear Regression


In [None]:
sns.distplot(fish_final['Weight'])
plt.show()

In [None]:
fish_final['Weight'].skew()

In [None]:
# Doing log transformation to make 'Weight' variable normally distributed
fish_final['ln_Weight'] = np.log(fish_final['Weight'])

In [None]:
sns.distplot(fish_final['ln_Weight'])
plt.show()

In [None]:
np.log(fish_final['Weight']).skew()

# Building Linear Regression Model

### Separating data into train and test

In [None]:
train_x,test_x,train_y,test_y = train_test_split(fish_final[fish_final.columns.difference(['Weight','ln_Weight'])],
                                                 fish_final['ln_Weight'],random_state=123,test_size=0.3)

In [None]:
train,test = train_test_split(fish_final,random_state=123,test_size=0.3)

In [None]:
print('No.of observations in train',train.shape)
print('No.of observations in test',test.shape)

In [None]:
train.columns

In [None]:
fish_log = smf.ols('''ln_Weight ~ Length1 + Length2 + Length3 + Height + Width + Species_Parkki + Species_Perch + 
                      Species_Pike + Species_Roach + Species_Smelt + Species_Whitefish''',train).fit()

In [None]:
print(fish_log.summary())

# Model 2

### VIF to find multicollinearity

In [None]:
model_param = '''ln_Weight ~ Length1   + Height   + Width +
                     Species_Smelt '''

In [None]:
from patsy import dmatrices
from statsmodels.stats.outliers_influence import variance_inflation_factor
Y,X = dmatrices(model_param,train,return_type='dataframe')
vif=pd.DataFrame()
vif['Features'] = X.columns
vif['Vif_value'] = [variance_inflation_factor(X.values,i) for i in range(X.shape[1])]

In [None]:
vif.sort_values(by='Vif_value',ascending=False)

In [None]:
fish_log_2 = smf.ols('''ln_Weight ~ Length1   + Height  + 
                     Species_Smelt ''',train).fit()

In [None]:
print(fish_log_2.summary())

In [None]:
np.mean(np.abs(train['ln_Weight'] - fish_log_2.predict(train)/train['ln_Weight']))

In [None]:
np.mean(np.abs(test['ln_Weight'] - fish_log_2.predict(test)/test['ln_Weight']))

In [None]:
train['Predicted'] = np.exp(fish_log_2.predict(train))

In [None]:
train.head()

In [None]:
test['Predicted'] = np.exp(fish_log_2.predict(test))

In [None]:
test.head()

# Random Forest Model

In [None]:
from sklearn.ensemble import RandomForestRegressor
fish_random = RandomForestRegressor()
fish_random

In [None]:
param_grid = {'max_depth':np.arange(3,11),'max_features':np.arange(1,7),'n_estimators':[60,80,100,150,200]}


In [None]:
from sklearn.model_selection import GridSearchCV
fish_grid = GridSearchCV(fish_random,param_grid=param_grid,verbose=True,cv=5,n_jobs=-1)
fish_grid.fit(train_x,train_y)

In [None]:
fish_grid.best_params_

In [None]:
# Running random forest model with best parameters
fish_random = RandomForestRegressor(max_depth= 8 ,max_features = 2,n_estimators =100,oob_score=True)

In [None]:
fish_random.fit(train_x,train_y)

In [None]:
metrics.mean_squared_error(fish_random.predict(train_x),train_y)

In [None]:
metrics.mean_squared_error(fish_random.predict(test_x),test_y)