# Anscombe's Quartet Dataset

In [None]:
# Import matplotlib.
import matplotlib.pyplot as plt

# Make matplotlib render plots inline (as static images) in the notebook.
%matplotlib inline

# Import numpy.
import numpy as np

# Import pandas.
import pandas as pd

# Import mean.
from statistics import mean

In [None]:
# Load the quartet from the CSV file into a pandas DataFrame. The first
# row supplies the column names and the first column becomes the index.
with open('anscombe.csv') as csv_handle:
    Anscombe = pd.read_csv(csv_handle, index_col=0, header=0)

# Echo the loaded table so the raw values can be inspected.
print(Anscombe)


In [None]:
# Calculates the slope and intercept of the least-squares best fit line
# through paired x and y values, then evaluates that line at every x.
def best_fit_line(x,y):
    """Fit the least-squares line ``y = m*x + b`` to paired observations.

    Parameters
    ----------
    x, y
        Equal-length, one-dimensional sequences of numbers. Anything that
        ``numpy.asarray`` can convert is accepted (lists, tuples, NumPy
        arrays, pandas Series).

    Returns
    -------
    regression_line : list of float
        The fitted y value for each entry of ``x``, in the same order.
    m : float
        Slope of the fitted line.
    b : float
        y-intercept of the fitted line.

    Raises
    ------
    ValueError
        If the inputs are not one-dimensional, differ in length, are empty,
        or if every x value is identical (the slope is then undefined).
    """
    # Convert once to float arrays so plain Python sequences work and all
    # arithmetic below is vectorised floating-point.
    x_vals = np.asarray(x, dtype=float)
    y_vals = np.asarray(y, dtype=float)

    if x_vals.ndim != 1 or x_vals.shape != y_vals.shape:
        raise ValueError("x and y must be one-dimensional and of equal length")
    if x_vals.size == 0:
        raise ValueError("x and y must not be empty")
    # Compare min and max rather than testing the computed variance for
    # zero: rounding in the mean can leave a tiny non-zero variance even
    # when all x values are the same.
    if x_vals.min() == x_vals.max():
        raise ValueError("slope is undefined when all x values are identical")

    x_mean = x_vals.mean()
    y_mean = y_vals.mean()

    # Centred form of the least-squares slope, cov(x, y) / var(x). It is
    # algebraically equal to (mean(x)mean(y) - mean(xy)) /
    # (mean(x)^2 - mean(x^2)) but avoids subtracting two large, nearly
    # equal numbers.
    x_dev = x_vals - x_mean
    m = float(np.dot(x_dev, y_vals - y_mean) / np.dot(x_dev, x_dev))

    b = float(y_mean - m * x_mean)

    regression_line = (m * x_vals + b).tolist()

    return regression_line, m, b

# Scatter-plot the x/y pairs with their best fit line drawn over them, then
# print the line's slope and y-intercept rounded to two decimal places.
def plot_best_fit (x,y,label1,label2):
    """Plot paired data with its best fit line and report the fit.

    x, y   -- paired values to plot; passed unchanged to best_fit_line.
    label1 -- text for the x-axis label.
    label2 -- text for the y-axis label.

    Shows the figure via plt.show() and prints the rounded slope and
    y-intercept; returns nothing.
    """
    fitted_values, slope, intercept = best_fit_line(x, y)

    # Raw observations first, fitted line on top of them.
    plt.scatter(x, y)
    plt.plot(x, fitted_values)

    # Axis captions come straight from the caller.
    plt.xlabel(label1)
    plt.ylabel(label2)
    plt.show()

    slope_rounded = round(slope, 2)
    print("Best fit slope = ", slope_rounded)
    intercept_rounded = round(intercept, 2)
    print("Best fit y-intercept = ", intercept_rounded)

In [None]:
# Walk through the four x/y column pairs of the quartet, plotting each pair
# with its best fit line; the column names double as the axis labels.
for x_col, y_col in (('x1', 'y1'), ('x2', 'y2'), ('x3', 'y3'), ('x4', 'y4')):
    plot_best_fit(Anscombe[x_col], Anscombe[y_col], x_col, y_col)

In [None]:
# Summarise every column with pandas' describe(), rounded to two decimal
# places. Left as a bare expression so the notebook displays the table.
Anscombe.describe().round(2)