# Import Data Set

In [None]:
# Read the dataset from the CSV file in the working directory into `data`
# and preview its first rows.
import pandas as pd
data = pd.read_csv("WC_AT.csv")
data.head()

In [None]:
# Build a profiling report for `data` and write it to 'wcATreport.html'.
# NOTE(review): pandas_profiling appears to have been renamed to
# ydata-profiling upstream — confirm which package the target environment has.
import pandas_profiling as pp
EDA_report= pp.ProfileReport(data)
EDA_report.to_file(output_file='wcATreport.html')

In [None]:
# Print a concise summary of the DataFrame's columns.
data.info()

# Correlation

In [None]:
# Pairwise plots for the columns of `data`.
import seaborn as sns
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
sns.pairplot(data)

In [None]:
# Scatter plot of AT against Waist; both columns are looked up by name in `data`.
import matplotlib.pyplot as plt
plt.scatter("Waist","AT",data=data)
# Check this plot for heteroscedasticity, i.e. whether the spread of AT
# changes as Waist increases.

In [None]:
# Pairwise correlation matrix of the columns in `data`.
data.corr()



In [None]:
# Distribution plot of the Waist column.
# NOTE(review): `sns.distplot` is deprecated in recent seaborn releases in
# favour of `histplot`/`displot` — confirm the installed seaborn version
# before migrating.
import seaborn as sns
sns.distplot(data['Waist'])



In [None]:
# Distribution plot of the AT column.
# NOTE(review): `sns.distplot` is deprecated in recent seaborn releases in
# favour of `histplot`/`displot` — confirm the installed seaborn version
# before migrating.
import seaborn as sns
sns.distplot(data['AT'])

## Fitting a Linear Regression Model

In [None]:

# Box plot of the Waist column.
# The series is passed explicitly as `x`: a bare positional argument is
# interpreted as `x` by older seaborn releases but as `data` by newer ones,
# which silently changes the orientation of the plot.
sns.boxplot(x=data.Waist)

In [None]:
import statsmodels.formula.api as smf

# Baseline model: ordinary least squares regression of AT on Waist.
model_basic = smf.ols(formula="AT~Waist", data=data).fit()
model_basic.summary()


In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Scatter of AT against log(Waist) to inspect the relationship after
# log-transforming the predictor.
x = np.log(data["Waist"])
y = data["AT"]
plt.scatter(x, y)
plt.xlabel("log(Waist)")
plt.ylabel("AT")

In [None]:
import statsmodels.formula.api as smf

# Logarithmic model: regress AT on log(Waist).
# The transformed predictor is stored as a column so the formula can name it.
data["logwaist"] = np.log(data["Waist"])
model_log = smf.ols(formula="AT~logwaist", data=data).fit()
model_log.summary()


In [None]:
import numpy as np
import matplotlib.pyplot as plt

# Scatter of log(AT) against Waist, then the correlation matrix of `data`
# as the cell's displayed result.
x = data["Waist"]
y = np.log(data["AT"])
plt.scatter(x, y)
plt.xlabel("Waist")
plt.ylabel("logAT")
data.corr()

In [None]:
# Exponential model: regress log(AT) on Waist.
# The log-transformed response is stored as a new column so the formula
# can refer to it by name.
data["log_AT"] = np.log(data["AT"])

model_exp = smf.ols(formula="log_AT~Waist", data=data).fit()
model_exp.summary()


In [None]:
# Regression plot of log_AT against Waist; the trailing ';' suppresses the
# cell's text output.
sns.regplot(x="Waist", y="log_AT",data=data);

In [None]:
# Predictions of the exponential model, exponentiated to undo the log
# applied to AT when the model was fitted.
pred = np.exp(model_exp.predict(data["Waist"]))
pred

In [None]:
# Residuals of the back-transformed predictions and their mean.
errors = data["AT"] - pred
errors.mean()

In [None]:
# The plot above suggests a curvilinear relationship between log(AT) and Waist,
# so fit a quadratic model: log(AT) = b0 + b1*Waist + b2*Waist^2.
data["waist_sq"] = data["Waist"] * data["Waist"]
print(data.corr())
model_quad = smf.ols(formula="log_AT~Waist+waist_sq", data=data).fit()
model_quad.summary()


In [None]:
# Root-mean-square error helper used to compare the fitted models.
def RMSE(actual,pred):
    """Return the root-mean-square error between `actual` and `pred`.

    Both arguments must support element-wise subtraction and multiplication
    (e.g. NumPy arrays or pandas Series).
    """
    residual = actual - pred
    mean_squared = np.mean(residual * residual)
    return np.sqrt(mean_squared)

In [None]:
# RMSE of the baseline linear model, evaluated on the fitting data.
pred_basic = model_basic.predict(data)
rmse_basic = RMSE(actual=data["AT"], pred=pred_basic)
rmse_basic



In [None]:
# RMSE of the log(Waist) model, evaluated on the fitting data.
pred_log = model_log.predict(data)
rmse_log = RMSE(actual=data["AT"], pred=pred_log)
rmse_log

In [None]:
# RMSE of the exponential model. Its predictions are log(AT), so they are
# exponentiated before being compared with AT.
pred_exp = np.exp(model_exp.predict(data))
rmse_exp = RMSE(actual=data["AT"], pred=pred_exp)
rmse_exp

In [None]:
# RMSE of the quadratic model. Its predictions are log(AT), so they are
# exponentiated before being compared with AT.
pred_quad = np.exp(model_quad.predict(data))
rmse_quad = RMSE(actual=data["AT"], pred=pred_quad)
rmse_quad

In [None]:
# Akaike information criterion (AIC) of the baseline model.
model_basic.aic

In [None]:
# Side-by-side comparison of the four fitted models.
# The mapping is named `model_comparison` rather than `dict` so the builtin
# `dict` type is not shadowed for the rest of the notebook.
# NOTE: "basic" and "log" are fitted on AT while "exp" and "quad" are fitted
# on log_AT, so their AIC values are on different response scales and should
# only be compared within each pair. The RMSE values are all computed against
# AT and are directly comparable.
model_comparison = {
    "model": ["basic", "log", "exp", "quad"],
    "RMSE": [rmse_basic, rmse_log, rmse_exp, rmse_quad],
    "AIC": [model_basic.aic, model_log.aic, model_exp.aic, model_quad.aic],
}
rmse_table = pd.DataFrame(model_comparison)
rmse_table

In [None]:
# Estimated coefficients of the quadratic model.
model_quad.params


In [None]:
# t-statistics and p-values of the quadratic model's coefficients,
# printed one after the other.
print(model_quad.tvalues, '\n', model_quad.pvalues)    

In [None]:
# R-squared and adjusted R-squared of the quadratic model, shown as a tuple.
(model_quad.rsquared,model_quad.rsquared_adj)

# Predict for new data point

In [None]:
# Waist values (76 and 101) for which AT will be predicted.
newdata=pd.Series([76,101])

In [None]:
# Assemble the predictors the quadratic model needs (Waist and its square),
# then exponentiate the predicted log(AT) to get values on the AT scale.
data_pred = pd.DataFrame({"Waist": newdata})
data_pred["waist_sq"] = data_pred["Waist"] * data_pred["Waist"]
np.exp(model_quad.predict(data_pred))