#  Data Analysis Exercises

In [2]:
import numpy as np
import pandas as pd
import statsmodels.api as sm
import statsmodels.formula.api as smf

# 1

In [3]:
house = pd.read_csv('houses_142.csv')
house.head()

Unnamed: 0,Year,LogSalePrice,SqftLot,SqftAbove,SqftBasement,SqftLawn,Floors,Bedrooms,Bathrooms,Kitchens,Condition,Grade,OnlineGrade,YearsSince1900Built
0,2014,12.309982,5650,1180,0,0,1.0,3,1.0,0,3,7,7.0,55
1,2014,13.195614,7242,2170,400,0,2.0,3,2.25,1,3,7,7.1,51
2,2015,12.100712,10000,770,0,3949,1.0,2,1.0,0,3,6,6.1,33
3,2014,13.311329,5000,1050,910,0,1.0,4,3.0,0,5,7,9.0,65
4,2015,13.142166,8080,1680,0,2790,1.0,3,2.0,0,3,8,7.5,87


In [4]:
house_train = house[house['Year'] <= 2014]
house_test = house[house['Year'] >= 2015]

len(house_train), len(house_test)

(14624, 6976)

In [5]:
ols = smf.ols(formula='LogSalePrice ~ SqftLot + SqftAbove + SqftBasement + SqftLawn + Floors + Bedrooms + Kitchens \
+ Condition + Grade + OnlineGrade + YearsSince1900Built', 
                 data=house_train)
model =ols.fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:           LogSalePrice   R-squared:                       0.693
Model:                            OLS   Adj. R-squared:                  0.693
Method:                 Least Squares   F-statistic:                     3000.
Date:                Thu, 18 Mar 2021   Prob (F-statistic):               0.00
Time:                        18:16:50   Log-Likelihood:                -2684.3
No. Observations:               14624   AIC:                             5393.
Df Residuals:                   14612   BIC:                             5484.
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
Intercept               9.7898    

We have a summary of important information like the R squared value, etc about our model. Now we must do variable selection to predict the best, most accurate LogSalePrice. We will be calculating Variance Inflation Factor (VIF) for each variable and the variables with higher VIF values and p values will be taken out to make our results more fine-tuned.

In [6]:
from statsmodels.stats.outliers_influence import variance_inflation_factor

# The dataframe passed to VIF must include the intercept term. We add it the same way we did before.
def VIF(df, columns):
    values = sm.add_constant(df[columns]).values
    num_columns = len(columns)+1
    vif = [variance_inflation_factor(values, i) for i in range(num_columns)]
    return pd.Series(vif[1:], index=columns)

cols = ['SqftLot', 'SqftAbove', 'SqftBasement', 'SqftLawn', 'Floors', 'Bedrooms', 'Kitchens', 'Condition', 'Grade', \
        'OnlineGrade', 'YearsSince1900Built']
VIF(house_train, cols)

  return ptp(axis=axis, out=out, **kwargs)


SqftLot                58.260256
SqftAbove               3.564801
SqftBasement            1.527227
SqftLawn               57.629814
Floors                 18.027362
Bedrooms                1.614541
Kitchens               15.795897
Condition              13.101427
Grade                   8.483883
OnlineGrade            16.774044
YearsSince1900Built     1.799963
dtype: float64

In [7]:
ols = smf.ols(formula='LogSalePrice ~ SqftAbove + SqftBasement + Floors + Bedrooms + Kitchens \
+ Condition + Grade + OnlineGrade + YearsSince1900Built', 
                 data=house_train)
model =ols.fit()
print(model.summary())
cols = ['SqftAbove', 'SqftBasement', 'Floors', 'Bedrooms', 'Kitchens', 'Condition', 'Grade', \
        'OnlineGrade', 'YearsSince1900Built']
VIF(house_train, cols)

                            OLS Regression Results                            
Dep. Variable:           LogSalePrice   R-squared:                       0.693
Model:                            OLS   Adj. R-squared:                  0.693
Method:                 Least Squares   F-statistic:                     3666.
Date:                Thu, 18 Mar 2021   Prob (F-statistic):               0.00
Time:                        18:16:54   Log-Likelihood:                -2685.2
No. Observations:               14624   AIC:                             5390.
Df Residuals:                   14614   BIC:                             5466.
Df Model:                           9                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
Intercept               9.7893    

  return ptp(axis=axis, out=out, **kwargs)


SqftAbove               3.369072
SqftBasement            1.519316
Floors                 17.928710
Bedrooms                1.598586
Kitchens               15.767033
Condition              13.093688
Grade                   8.481763
OnlineGrade            16.768349
YearsSince1900Built     1.797079
dtype: float64

After taking out the variables SqftLot and SqftLawn, the R squared value and adjusted R squared value did not change but the p-values went down. Some of the VIF values are still pretty high so I will be trying to take out Floors, OnlineGrade, Kitchens, and Condition and see what results show.

In [8]:
ols = smf.ols(formula='LogSalePrice ~ SqftAbove + SqftBasement  + Bedrooms + Kitchens \
+ Condition + Grade + OnlineGrade + YearsSince1900Built', 
                 data=house_train)
model =ols.fit()
print(model.summary())
cols = ['SqftAbove', 'SqftBasement', 'Bedrooms', 'Kitchens', 'Condition', 'Grade', \
        'OnlineGrade', 'YearsSince1900Built']
VIF(house_train, cols)

                            OLS Regression Results                            
Dep. Variable:           LogSalePrice   R-squared:                       0.693
Model:                            OLS   Adj. R-squared:                  0.693
Method:                 Least Squares   F-statistic:                     4119.
Date:                Thu, 18 Mar 2021   Prob (F-statistic):               0.00
Time:                        18:16:55   Log-Likelihood:                -2692.1
No. Observations:               14624   AIC:                             5402.
Df Residuals:                   14615   BIC:                             5471.
Df Model:                           8                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
Intercept               9.8459    

  return ptp(axis=axis, out=out, **kwargs)


SqftAbove               3.360049
SqftBasement            1.518570
Bedrooms                1.590673
Kitchens                1.497389
Condition              13.088209
Grade                   8.468883
OnlineGrade            16.767514
YearsSince1900Built     1.530630
dtype: float64

In [9]:
ols = smf.ols(formula='LogSalePrice ~ SqftAbove + SqftBasement  + Bedrooms + Kitchens \
+ Condition  + OnlineGrade + YearsSince1900Built', 
                 data=house_train)
model =ols.fit()
print(model.summary())
cols = ['SqftAbove', 'SqftBasement', 'Bedrooms', 'Kitchens', 'Condition', \
        'OnlineGrade', 'YearsSince1900Built']
VIF(house_train, cols)

                            OLS Regression Results                            
Dep. Variable:           LogSalePrice   R-squared:                       0.693
Model:                            OLS   Adj. R-squared:                  0.692
Method:                 Least Squares   F-statistic:                     4705.
Date:                Thu, 18 Mar 2021   Prob (F-statistic):               0.00
Time:                        18:16:55   Log-Likelihood:                -2695.0
No. Observations:               14624   AIC:                             5406.
Df Residuals:                   14616   BIC:                             5467.
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
Intercept               9.8263    

  return ptp(axis=axis, out=out, **kwargs)


SqftAbove              2.994201
SqftBasement           1.493385
Bedrooms               1.581435
Kitchens               1.497017
Condition              5.306942
OnlineGrade            5.752407
YearsSince1900Built    1.444886
dtype: float64

I removed Floor which helped the model.Removing Online Grade and Conditions resulted in lower p-values and VIF values but also a big decrease in the R squared value so I decided to keep them and remove Grade instead. This kept my R squared value at 0.693 consistently and decreased both the p-values and the VIF values. The highest VIF values are slightly over 5 but when I tried removing them the R squared value dropped by a lot so I think it is best to keep them.

In [47]:
ols = smf.ols(formula='LogSalePrice ~ SqftAbove + SqftBasement  + Bedrooms + Kitchens \
+ Condition  + OnlineGrade + YearsSince1900Built', 
                 data=house_test)
model =ols.fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:           LogSalePrice   R-squared:                       0.656
Model:                            OLS   Adj. R-squared:                  0.656
Method:                 Least Squares   F-statistic:                     1902.
Date:                Thu, 18 Mar 2021   Prob (F-statistic):               0.00
Time:                        11:23:20   Log-Likelihood:                -1708.3
No. Observations:                6976   AIC:                             3433.
Df Residuals:                    6968   BIC:                             3487.
Df Model:                           7                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
Intercept               9.8581    

In [54]:
#now we want to see how the model is doing on the test set so we are gonna calculate the OSR squared value.
def osr2(model, x_train, y_train, x_test, y_test):
    y_pred = model.predict(x_test)
    SSE = np.sum((y_test - y_pred)**2)
    SST = np.sum((y_test - np.mean(y_train))**2) 
    return (1 - SSE/SST)
x_test = house_test[cols]
x_test = sm.add_constant(x_test)
y_train = house_train['LogSalePrice']
y_test = house_test['LogSalePrice']

  return ptp(axis=axis, out=out, **kwargs)


In [55]:
osr2(model, house_train[['SqftAbove', 'SqftBasement', 'Bedrooms', 'Kitchens', 'Condition', \
        'OnlineGrade', 'YearsSince1900Built']], y_train,
house_test[['SqftAbove', 'SqftBasement', 'Bedrooms', 'Kitchens', 'Condition', \
        'OnlineGrade', 'YearsSince1900Built']], y_test)

0.656477741133084

In [56]:
ols = smf.ols(formula='LogSalePrice ~ SqftLot + SqftAbove + SqftBasement + SqftLawn + Floors + Bedrooms + Kitchens \
+ Condition + Grade + OnlineGrade + YearsSince1900Built', 
                 data=house_test)
model =ols.fit()
print(model.summary())

                            OLS Regression Results                            
Dep. Variable:           LogSalePrice   R-squared:                       0.657
Model:                            OLS   Adj. R-squared:                  0.656
Method:                 Least Squares   F-statistic:                     1213.
Date:                Thu, 18 Mar 2021   Prob (F-statistic):               0.00
Time:                        11:30:41   Log-Likelihood:                -1702.5
No. Observations:                6976   AIC:                             3429.
Df Residuals:                    6964   BIC:                             3511.
Df Model:                          11                                         
Covariance Type:            nonrobust                                         
                          coef    std err          t      P>|t|      [0.025      0.975]
---------------------------------------------------------------------------------------
Intercept               9.7803    

The training set R squared is 0.693 and the OSR squared is 0.656477741133084. So comparing the original model with the new model, both R squared and OSR squared values stayed similar but the p-values and the VIF values improved by a lot. So overall, the new model has given us better results in predicting the LogSalePrice and can be considered useful because the p-values and VIF values got better

# 2

# a

300p = 10(1-p) + 10p (we have to consider both cases because the travelers will have to pay 10 dollars when they buy insurance whether the flights get cancelled or not and the other side, travelers pay 300 dollars if they buy insurance and 0 dollars if they do not <br>
300p + 10p = 10 - 10p + 10p <br>
300p = 10<br>
p = 1/30

# b

In [3]:
flight = pd.read_csv('flights_142.csv')
flight.head()

Unnamed: 0,Month,Cancelled,Airline,Destination,DayOfWeek,ScheduledDepartureHour,ScheduledTime
0,January,0,US,CLT,Thursday,0,286
1,January,0,DL,MSP,Thursday,0,217
2,January,0,AA,DFW,Thursday,0,195
3,January,0,UA,IAH,Thursday,0,218
4,January,0,UA,DEN,Thursday,5,146


In [4]:
flight_train = flight[flight['Month'] != 'March']
flight_test = flight[flight['Month'] == 'March']

len(flight_train), len(flight_test)

(24460, 13422)

In [None]:
logreg = smf.logit(formula = 'Cancelled ~ Month + Airline + Destination + DayOfWeek + ScheduledDepartureHour + ScheduledTime',
                       data = flight_train).fit(method = 'ncg')
print(logreg.summary())