# In [None]:
# Load libraries
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import statsmodels.api as sm
import codecademylib3

# Read the CSV file into a DataFrame.
df = pd.read_csv('codecademy.csv')

# Preview the data: the first five rows.
print(df.head(5))

# Scatter plot with `completed` on the x-axis and `score` on the y-axis.
plt.scatter(x=df['completed'], y=df['score'])

# Render the figure, then clear it before the next plot is drawn.
plt.show()
plt.clf()

# Fit an ordinary least squares regression for the formula
# 'score ~ completed' and print the estimated coefficients.
model = sm.OLS.from_formula('score ~ completed', df)
results = model.fit()
params = results.params
print(params)

# `params` is a label-indexed Series. Integer keys such as params[0] rely on
# a positional fallback that pandas has deprecated, so select the first and
# second coefficients by position explicitly with .iloc instead.
intercept = params.iloc[0]
slope = params.iloc[1]

# Series.max() skips missing values, unlike the builtin max().
max_completed = df.completed.max()

# Plot the scatter plot with the fitted line on top. The line is drawn
# between x = 0 and the largest observed `completed` value.
plt.scatter(df.completed, df.score)
plt.plot(
    [0, max_completed],
    [intercept, max_completed * slope + intercept],
)

# Show then clear plot
plt.show()
plt.clf()

# Predict score for a learner who has completed 20 prior lessons
new_data = {'completed':[20]}
print(results.predict(new_data))

# Calculate fitted values: the model's predicted score for every row of df.
fitted_values = results.predict(df.completed)

# Calculate residuals: observed score minus predicted score.
residuals = df.score - fitted_values

# Check normality assumption.
# The assumption concerns the residuals, not the fitted values, so the
# histogram must be drawn from `residuals`.
plt.hist(residuals)

# Show then clear the plot
plt.show()
plt.clf()

# Check homoscedasticity assumption.
# Fitted values go on the x-axis and residuals on the y-axis, so the
# vertical spread of the residuals can be compared across the range of
# predictions.
plt.scatter(fitted_values, residuals)

# Show then clear the plot
plt.show()
plt.clf()

# Box plot of `score` (y-axis) for each value of `lesson` (x-axis).
sns.boxplot(x='lesson', y='score', data=df)

# Render the figure, then clear it before the next plot is drawn.
plt.show()
plt.clf()

# Fit a linear regression to predict score based on which lesson they took
model2 = sm.OLS.from_formula('score ~ lesson', df)
results2 = model2.fit()
# Read the coefficients from the second model's results (results2), not
# from the earlier 'score ~ completed' fit.
params2 = results2.params
print(params2)

# Calculate and print the group means and mean difference (for comparison).
# Select `score` before averaging so that only that column is aggregated.
means2 = df.groupby('lesson')['score'].mean()
print(means2)
# Difference between the second and first group means (groups are ordered by
# the sorted `lesson` values). Presumably this is the quantity to compare
# with the lesson coefficient printed above -- TODO confirm that `lesson`
# has exactly two groups.
print(means2.iloc[1] - means2.iloc[0])


# Plot `score` against `completed` with `sns.lmplot()`, using `lesson` as the
# hue so each lesson value is drawn in its own color.
sns.lmplot(data=df, x='completed', y='score', hue='lesson')
plt.show()
