## Simple Linear Regression 
### smoking and cancer

In [None]:
# Import packages
import pandas as pd
import statsmodels.api as sm
import matplotlib.pyplot as plt

# The fundamental data structure of pandas is the DataFrame

In [None]:
# Load the dataset from the CSV file into a pandas DataFrame
# (the relative path is resolved against the current working directory)
data = pd.read_csv('smoking_cancer.csv')
# Show the first rows as a quick sanity check of what was loaded
data.head()

## H0: There is NO correlation between smoking and cancer
## H1: There IS correlation between smoking and cancer

In [None]:
# Dimensions of the data: (number of rows, number of columns)
data.shape

In [None]:
# Column labels of the data; the cells below use 'smoking' and 'cancer'
data.columns

In [None]:
# Scatter plot of the raw data: the predictor (smoking) on the x-axis and
# the response (cancer) on the y-axis. The axes are labelled with the column
# names so the figure can be read on its own.
plt.scatter(data['smoking'], data['cancer'])
plt.xlabel('smoking')
plt.ylabel('cancer')
plt.title('cancer vs. smoking')
plt.show()

In [None]:
# Correlation coefficient between smoking and cancer
# (Series.corr computes the Pearson coefficient unless another method is given)
data['smoking'].corr(data['cancer'])

### Build the linear model

In [None]:
# Set up the regression variables

# Response (dependent variable, output): cancer
y = data['cancer']

# Design matrix (independent variable, input): smoking, together with a
# column of ones so that the fitted model includes an intercept term
x = sm.add_constant(data['smoking'])

In [None]:
# Preview the first rows of the response variable (cancer)
y.head()

In [None]:
# Preview the design matrix: smoking plus the column of ones added above
x.head()

In [None]:
# Set up the ordinary least squares model.
# OLS expects the response (endog) first and the design matrix (exog) second;
# naming the arguments makes that order explicit.
model = sm.OLS(endog=y, exog=x)

In [None]:
# Fit the model and print the regression summary
# (the summary is where the p-value discussed below is read from)
results = model.fit()
print(results.summary())

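The p-value for the smoking coefficient (column `P>|t|` in the summary above) is below 0.05, so we reject H0 at the 5% significance level. Smoking is therefore a statistically significant predictor of cancer in this model.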

In [None]:
# Estimated model coefficients: one for the constant term (intercept)
# and one for smoking (slope)
results.params

In [None]:
# Plot the observed data together with the fitted regression line.
# Axis labels and a legend are added so the two series can be told apart.
plt.scatter(data['smoking'], data['cancer'], color='blue', label='observed')
plt.plot(data['smoking'], results.fittedvalues, color='red', label='fitted line')
plt.xlabel('smoking')
plt.ylabel('cancer')
plt.legend()
plt.show()

In [None]:
# In-sample prediction: the fitted values are the model's predictions
# for the observations it was estimated on
yp = results.fittedvalues
print('predicted y:',yp,sep='\n')

In [None]:
# In-sample prediction, another way: call predict() on the same design
# matrix x (constant column included) that the model was fitted on
yp = results.predict(x)
print('predicted y:',yp,sep='\n')

In [None]:
# Out-of-sample prediction for new smoking values
data_new = [[25], [35], [50]]
x_new = pd.DataFrame(data_new, columns=['smoking'])
# The new design matrix needs the same constant column as the one used for
# fitting. has_constant='add' forces the column to be added: with the default
# ('skip') it is silently omitted whenever the new data has a single row or
# identical values, because the smoking column itself then looks constant,
# and predict() would fail on the resulting shape mismatch.
x_new = sm.add_constant(x_new, has_constant='add')

y_new = results.predict(x_new)
print('predicted y:',y_new,sep='\n')