In [1]:
#Southwest analysis premlim

In [34]:
import pandas as pd
import numpy as np
from scipy.stats import skew
import warnings
from sklearn.linear_model import Ridge, RidgeCV, ElasticNet,Lasso, LassoCV, ElasticNetCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from tabulate import tabulate

warnings.filterwarnings("ignore") 

%matplotlib inline

#Lets read the data
train = pd.read_csv("QueryResultTest_Rev.csv")
test = pd.read_csv("QueryResult.csv")
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [35]:
#Data cleansing
all_data = pd.concat((train,
                      test))
#log transform the target:
train["rev"] = np.log1p(train["rev"])

#log transform skewed numeric features:
numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index

skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna())) #compute skewness
skewed_feats = skewed_feats[skewed_feats > 0.75]
skewed_feats = skewed_feats.index  #Lets mark which features are very skewed.

all_data[skewed_feats] = np.log1p(all_data[skewed_feats])


all_data = pd.get_dummies(all_data)
all_data = all_data.fillna(all_data.mean())
X_train = all_data[:train.shape[0]]
X_test = all_data[train.shape[0]:]
y = train.rev


In [60]:
def rmse_cv(model, dataset):
    rmse= np.sqrt(-cross_val_score(model, dataset, y, scoring="neg_mean_squared_error", cv = 5))
    return(rmse)

def outputPredictionToCSV(predictionData, filename):
    arr = test.loc[:,'year']
    arr2 = test.loc[:,'quarter']
    arr3 = np.expm1(predictionData)
   
    df = pd.DataFrame(data=arr, columns = ['year'])
    df.applymap(int)
    
    df1 = pd.DataFrame(data=arr2, columns = ['quarter'])
    df1.applymap(int)
    
    df2 = pd.DataFrame(data=arr3, columns = ['predict.rev'])
        
    df4 = pd.concat([df,df1,df2], axis=1)
    dff = pd.DataFrame(data=df4) 
    dff.to_csv(filename+".csv", index=False)

def printRMSE_MSE(modelName,model,trainData):
    print(modelName +" has RMSE of "+ str(rmse_cv(model,trainData).mean()))
    yHat= model.predict(trainData) 
    print(modelName + " has MSE on train data is: "+ str(mean_squared_error(y,yHat)))

def printActuals(predictionData):
    act_data = pd.read_csv("QueryResult_Rev.csv")
#     print(act_data)
    arr3 = np.expm1(predictionData)
    percentErrors =[]
    for x in range(len(act_data.rev)):
        percentError= (100*(abs(act_data.rev[x]-arr3[x])/act_data.rev[x]))
        print("Quarter : "+str(act_data.quarter[x])+" Year: "+str(act_data.year[x])+" : Actual: "+str(act_data.rev[x])+" ,Prediction: "+str(arr3[x])+" ,Error: "+ str(percentError)+"%")
        percentErrors.append(percentError)
    print("Avg Percent Error is : "+str(sum(percentErrors) / len(percentErrors) ))
        

In [61]:
# ridge regression
modelForProblem1= Ridge(alpha=.1) .fit(X_train, y)

printRMSE_MSE('ridge_regression_withGivenAlpha',modelForProblem1,X_train)

predictforModel1= modelForProblem1.predict(X_test)
outputPredictionToCSV(predictforModel1,'ridge_regression_withGivenAlpha_output')


ridge_regression_withGivenAlpha has RMSE of 0.0528045431453785
ridge_regression_withGivenAlpha has MSE on train data is: 0.00011834839206927143


In [62]:
# Find the best Alpha for a ridge regression and a lasso regression model.

alphas = [0.000005, 0.00005, 0.00001, 0.0005, 0.0001, 0.005, 0.05, 0.1, 0.3, 0.5, 1, 3, 5, 7, 8, 9,9.5, 10, 10.5, 11, 15, 30, 50, 75, 100]

cv_ridge = [rmse_cv(Ridge(alpha = alpha),X_train).mean() 
            for alpha in alphas]
cv_ridge = pd.Series(cv_ridge, index = alphas)
print("The best ridge regression : "+str(cv_ridge.min()) +" with Alpha as " + str(cv_ridge.idxmin()))
print("The top five are:\n"+ str(cv_ridge.sort_values().head(5)))



cv_lasso = [rmse_cv(Lasso(alpha = alpha),X_train).mean() 
            for alpha in alphas]
cv_lasso = pd.Series(cv_lasso, index = alphas)
print("\n\nThe best lasso regression : " +str(cv_lasso.min()) +" with Alpha as " + str(cv_lasso.idxmin()))
print("The top five are:\n"+ str(cv_lasso.sort_values().head(5)))

bestRidgeRegressionAlphaValue = cv_ridge.idxmin()
bestLassoRegressionAlphaValue = cv_lasso.idxmin()

The best ridge regression : 0.0340797048265295 with Alpha as 5.0
The top five are:
5.0    0.034080
7.0    0.034094
8.0    0.034110
9.0    0.034129
9.5    0.034138
dtype: float64


The best lasso regression : 0.03184616720321791 with Alpha as 0.0005
The top five are:
0.0005    0.031846
0.0050    0.032383
0.0500    0.034090
0.1000    0.035930
0.0001    0.040202
dtype: float64


In [63]:
# ridge regression with best alpha
modelForProblem2= Ridge(alpha = bestRidgeRegressionAlphaValue).fit(X_train, y)

printRMSE_MSE('ridge_regression_withBestAlpha',modelForProblem2,X_train)

predictforModel2= modelForProblem2.predict(X_test)
outputPredictionToCSV(predictforModel2,'ridge_regression_WithBestAlpha_output')

ridge_regression_withBestAlpha has RMSE of 0.0340797048265295
ridge_regression_withBestAlpha has MSE on train data is: 0.00013014101763361284


In [64]:
# Lasso with best alpha
modelForProblem2b= Lasso(alpha = bestLassoRegressionAlphaValue).fit(X_train, y)

printRMSE_MSE('lasso_regression_withBestAlpha',modelForProblem2b,X_train)

predictforModel2b= modelForProblem2b.predict(X_test)
outputPredictionToCSV(predictforModel2b,'lasso_regression_WithBestAlpha_output')




lasso_regression_withBestAlpha has RMSE of 0.03184616720321791
lasso_regression_withBestAlpha has MSE on train data is: 0.00012940690560303978


In [65]:
printActuals(predictforModel1)

Quarter : 2 Year: 2019 : Actual: 5909 ,Prediction: 5812.028048710654 ,Error: 1.6410890385741443%
Quarter : 1 Year: 2019 : Actual: 5149 ,Prediction: 4015.0329514624605 ,Error: 22.023053962663422%
Quarter : 4 Year: 2018 : Actual: 5704 ,Prediction: 3936.220076205935 ,Error: 30.99193414786229%
Quarter : 3 Year: 2018 : Actual: 5575 ,Prediction: 3955.9634524019084 ,Error: 29.041014306692226%
Quarter : 2 Year: 2018 : Actual: 5742 ,Prediction: 4137.463394116026 ,Error: 27.943862868059448%
Quarter : 1 Year: 2018 : Actual: 4944 ,Prediction: 4349.357618438553 ,Error: 12.027556261356132%
Quarter : 4 Year: 2017 : Actual: 5258 ,Prediction: 4341.655009903634 ,Error: 17.427633893046142%
Quarter : 3 Year: 2017 : Actual: 5303 ,Prediction: 4057.3051151158843 ,Error: 23.49038063141836%
Quarter : 2 Year: 2017 : Actual: 5731 ,Prediction: 4058.2467983330484 ,Error: 29.187806694589973%
Quarter : 1 Year: 2017 : Actual: 4854 ,Prediction: 4122.840107426938 ,Error: 15.063038577937007%
Quarter : 4 Year: 2016 : Act

In [66]:
printActuals(predictforModel2)

Quarter : 2 Year: 2019 : Actual: 5909 ,Prediction: 4930.040790173856 ,Error: 16.567256893317715%
Quarter : 1 Year: 2019 : Actual: 5149 ,Prediction: 3863.9577872948125 ,Error: 24.957122017968295%
Quarter : 4 Year: 2018 : Actual: 5704 ,Prediction: 3785.6833836783617 ,Error: 33.63107672373138%
Quarter : 3 Year: 2018 : Actual: 5575 ,Prediction: 3800.61351225537 ,Error: 31.82756031828933%
Quarter : 2 Year: 2018 : Actual: 5742 ,Prediction: 3917.185655767601 ,Error: 31.780117454413077%
Quarter : 1 Year: 2018 : Actual: 4944 ,Prediction: 4112.823446308621 ,Error: 16.811823496993913%
Quarter : 4 Year: 2017 : Actual: 5258 ,Prediction: 4038.6338278719895 ,Error: 23.190684140890273%
Quarter : 3 Year: 2017 : Actual: 5303 ,Prediction: 3906.531062808932 ,Error: 26.333564721687125%
Quarter : 2 Year: 2017 : Actual: 5731 ,Prediction: 3924.1437713916253 ,Error: 31.527765287181552%
Quarter : 1 Year: 2017 : Actual: 4854 ,Prediction: 4054.4633544815865 ,Error: 16.47170674739212%
Quarter : 4 Year: 2016 : Actu

In [67]:
printActuals(predictforModel2b)

Quarter : 2 Year: 2019 : Actual: 5909 ,Prediction: 5088.8390978021735 ,Error: 13.879859573495118%
Quarter : 1 Year: 2019 : Actual: 5149 ,Prediction: 3855.148429945086 ,Error: 25.128210721594762%
Quarter : 4 Year: 2018 : Actual: 5704 ,Prediction: 3776.7752963513663 ,Error: 33.78724936270396%
Quarter : 3 Year: 2018 : Actual: 5575 ,Prediction: 3794.040600066218 ,Error: 31.945460088498333%
Quarter : 2 Year: 2018 : Actual: 5742 ,Prediction: 3935.690070370278 ,Error: 31.457853180594253%
Quarter : 1 Year: 2018 : Actual: 4944 ,Prediction: 4133.62638822968 ,Error: 16.391052018008093%
Quarter : 4 Year: 2017 : Actual: 5258 ,Prediction: 4074.914626269377 ,Error: 22.50067276018682%
Quarter : 3 Year: 2017 : Actual: 5303 ,Prediction: 3910.226223818641 ,Error: 26.263884144472172%
Quarter : 2 Year: 2017 : Actual: 5731 ,Prediction: 3931.1973947258007 ,Error: 31.40468688316523%
Quarter : 1 Year: 2017 : Actual: 4854 ,Prediction: 4042.3469234071863 ,Error: 16.721324198451047%
Quarter : 4 Year: 2016 : Actua