In [1]:
#Southwest analysis premlim

In [4]:
import pandas as pd
import numpy as np
from scipy.stats import skew
import warnings
from sklearn.linear_model import Ridge, RidgeCV, ElasticNet,Lasso, LassoCV, ElasticNetCV
from sklearn.model_selection import cross_val_score
from sklearn.metrics import mean_squared_error
from tabulate import tabulate

warnings.filterwarnings("ignore") 

%matplotlib inline

#Lets read the data
train = pd.read_csv("QueryResultTest_Rev.csv")
test = pd.read_csv("QueryResult.csv")
pd.set_option('display.max_columns', None)
pd.set_option('display.max_rows', None)

In [5]:
#Data cleansing
all_data = pd.concat((train,
                      test))
#log transform the target:
train["rev"] = np.log1p(train["rev"])

#log transform skewed numeric features:
numeric_feats = all_data.dtypes[all_data.dtypes != "object"].index

skewed_feats = train[numeric_feats].apply(lambda x: skew(x.dropna())) #compute skewness
skewed_feats = skewed_feats[skewed_feats > 0.75]
skewed_feats = skewed_feats.index  #Lets mark which features are very skewed.

all_data[skewed_feats] = np.log1p(all_data[skewed_feats])


all_data = pd.get_dummies(all_data)
all_data = all_data.fillna(all_data.mean())
X_train = all_data[:train.shape[0]]
X_test = all_data[train.shape[0]:]
y = train.rev


In [6]:
def rmse_cv(model, dataset):
    rmse= np.sqrt(-cross_val_score(model, dataset, y, scoring="neg_mean_squared_error", cv = 5))
    return(rmse)

def outputPredictionToCSV(predictionData, filename):
    arr = test.loc[:,'quarterFlight.year']
    arr2 = test.loc[:,'quarterFlight.quarter']
    arr3 = np.expm1(predictionData)
   
    df = pd.DataFrame(data=arr, columns = ['quarterFlight.year'])
    df.applymap(int)
    
    df1 = pd.DataFrame(data=arr2, columns = ['quarterFlight.quarter'])
    df1.applymap(int)
    
    df2 = pd.DataFrame(data=arr3, columns = ['predict.rev'])
        
    df4 = pd.concat([df,df1,df2], axis=1)
    dff = pd.DataFrame(data=df4) 
    dff.to_csv(filename+".csv", index=False)

def printRMSE_MSE(modelName,model,trainData):
    print(modelName +" has RMSE of "+ str(rmse_cv(model,trainData).mean()))
    yHat= model.predict(trainData) 
    print(modelName + " has MSE on train data is: "+ str(mean_squared_error(y,yHat)))

def printActuals(predictionData):
    act_data = pd.read_csv("QueryResult_Rev.csv")
    arr3 = np.expm1(predictionData)
    percentErrors =[]
    for x in range(len(act_data.rev)):
        percentError= (100*(abs(act_data.rev[x]-arr3[x])/act_data.rev[x]))
        print(str(act_data.rev[x])+", "+str(arr3[x])+", "+ str(percentError)+"%")
        percentErrors.append(percentError)
    print("Avg Percent Error is : "+str(sum(percentErrors) / len(percentErrors) ))
        

In [7]:
# ridge regression
modelForProblem1= Ridge(alpha=.1) .fit(X_train, y)

printRMSE_MSE('ridge_regression_withGivenAlpha',modelForProblem1,X_train)

predictforModel1= modelForProblem1.predict(X_test)
outputPredictionToCSV(predictforModel1,'ridge_regression_withGivenAlpha_output')


ridge_regression_withGivenAlpha has RMSE of 0.05280454314537678
ridge_regression_withGivenAlpha has MSE on train data is: 0.00011834839206926356


In [8]:
# Find the best Alpha for a ridge regression and a lasso regression model.

alphas = [0.000005, 0.00005, 0.00001, 0.0005, 0.0001, 0.005, 0.05, 0.1, 0.3, 0.5, 1, 3, 5, 7, 8, 9,9.5, 10, 10.5, 11, 15, 30, 50, 75, 100]

cv_ridge = [rmse_cv(Ridge(alpha = alpha),X_train).mean() 
            for alpha in alphas]
cv_ridge = pd.Series(cv_ridge, index = alphas)
print("The best ridge regression : "+str(cv_ridge.min()) +" with Alpha as " + str(cv_ridge.idxmin()))
print("The top five are:\n"+ str(cv_ridge.sort_values().head(5)))



cv_lasso = [rmse_cv(Lasso(alpha = alpha),X_train).mean() 
            for alpha in alphas]
cv_lasso = pd.Series(cv_lasso, index = alphas)
print("\n\nThe best lasso regression : " +str(cv_lasso.min()) +" with Alpha as " + str(cv_lasso.idxmin()))
print("The top five are:\n"+ str(cv_lasso.sort_values().head(5)))

bestRidgeRegressionAlphaValue = cv_ridge.idxmin()
bestLassoRegressionAlphaValue = cv_lasso.idxmin()

The best ridge regression : 0.034079704826530124 with Alpha as 5.0
The top five are:
5.0    0.034080
7.0    0.034094
8.0    0.034110
9.0    0.034129
9.5    0.034138
dtype: float64


The best lasso regression : 0.03181106515548849 with Alpha as 0.0005
The top five are:
0.0005    0.031811
0.0050    0.032404
0.0500    0.034089
0.1000    0.035928
0.3000    0.043163
dtype: float64


In [9]:
# ridge regression with best alpha
modelForProblem2= Ridge(alpha = bestRidgeRegressionAlphaValue).fit(X_train, y)

printRMSE_MSE('ridge_regression_withBestAlpha',modelForProblem2,X_train)

predictforModel2= modelForProblem2.predict(X_test)
outputPredictionToCSV(predictforModel2,'ridge_regression_WithBestAlpha_output')

ridge_regression_withBestAlpha has RMSE of 0.034079704826530124
ridge_regression_withBestAlpha has MSE on train data is: 0.00013014101763360116


In [10]:
# Lasso with best alpha
modelForProblem2b= Lasso(alpha = bestLassoRegressionAlphaValue).fit(X_train, y)

printRMSE_MSE('lasso_regression_withBestAlpha',modelForProblem2b,X_train)

predictforModel2b= modelForProblem2b.predict(X_test)
outputPredictionToCSV(predictforModel2b,'lasso_regression_WithBestAlpha_output')




lasso_regression_withBestAlpha has RMSE of 0.03181106515548849
lasso_regression_withBestAlpha has MSE on train data is: 0.0001273116383557397


In [11]:
printActuals(predictforModel1)

5909, 5812.028048710942, 1.641089038569265%
5149, 4015.032951462546, 22.02305396266176%
5704, 3936.220076206019, 30.99193414786082%
5575, 3955.963452401965, 29.041014306691217%
5742, 4137.463394116115, 27.943862868057913%
4944, 4349.357618438584, 12.027556261355505%
5258, 4341.655009903757, 17.427633893043804%
5303, 4057.305115116, 23.490380631416183%
5731, 4058.246798333135, 29.18780669458847%
4854, 4122.840107426909, 15.063038577937608%
4940, 4057.6837986069827, 17.86065185006108%
5139, 4066.563573559025, 20.868581950593015%
5384, 4067.89516075906, 24.444740699125926%
4826, 3833.835718478963, 20.55872941402895%
Avg Percent Error is : 20.897862449713678


In [12]:
printActuals(predictforModel2)

5909, 4930.040790173707, 16.567256893320238%
5149, 3863.957787294854, 24.95712201796749%
5704, 3785.683383678402, 33.63107672373068%
5575, 3800.613512255404, 31.82756031828871%
5742, 3917.185655767601, 31.780117454413077%
4944, 4112.823446308621, 16.811823496993913%
5258, 4038.6338278719895, 23.190684140890273%
5303, 3906.531062808953, 26.33356472168673%
5731, 3924.143771391646, 31.527765287181186%
4854, 4054.463354481601, 16.47170674739182%
4940, 3975.5004994132855, 19.52428138839503%
5139, 3987.291527127937, 22.411139771785617%
5384, 3982.5935481447204, 26.02909457383506%
4826, 4033.8414921175827, 16.414390963166543%
Avg Percent Error is : 24.105541749931884


In [13]:
printActuals(predictforModel2b)

5909, 5273.9160104395305, 10.747740557801142%
5149, 3868.1246189350722, 24.876196952125223%
5704, 3791.8244320831072, 33.52341458479826%
5575, 3809.3417110140003, 31.671000699300443%
5742, 3967.336384846467, 30.906715694070584%
4944, 4164.511957332092, 15.766343905095232%
5258, 4125.427931034031, 21.539978489272904%
5303, 3927.633057615744, 25.935639117183783%
5731, 3947.6978657129475, 31.116770795446737%
4854, 4039.5069644972696, 16.77983179857294%
4940, 3975.4445703296324, 19.525413556080316%
5139, 3989.20189044652, 22.373965937993386%
5384, 3998.488321987451, 25.73387217705329%
4826, 3879.503162441335, 19.61245001157615%
Avg Percent Error is : 23.57923816259788
