In [None]:
import pandas as pd
import matplotlib as plt
import seaborn as sns
import random

In [None]:
# Let us start with a simple dataset.
# Seed the generator first so the noise below (and every number derived from
# it further down) is identical on each Restart & Run All.
random.seed(42)

# x runs 0, 2, ..., 20 (11 points).
x = list(range(0, 22, 2))
# y is built from the descending sequence 22, 20, ..., 2 plus uniform noise
# in [0, 1), so y decreases as x increases.
y = [4 + 0.8 * i + random.random() for i in range(22, 0, -2)]

# Dataframe
df = pd.DataFrame({'x': x, 'y': y})

In [None]:
# Preview the first five rows of the synthetic frame
df.head(n=5)

In [None]:
# Scatter of the raw points: y against x
sns.scatterplot(data=df, x='x', y='y')

Let us look at the correlation between x and y. Pandas has a `corr` method that lets us find the correlation between two columns. For demo purposes, we created the dataset to be highly negatively correlated. Therefore, we should expect a number close to -1, as we are using Pearson's correlation.  

In [None]:
# Pearson correlation between the two columns
df['x'].corr(df['y'], method='pearson')

In [None]:
# Fit the line y = m*x + c by least squares.

# The mean of x and y
x_mean = df.x.mean()
y_mean = df.y.mean()

# Per-row terms of the x/y covariance and of the x variance.
# The common 1/n factor is omitted because it cancels in the ratio below.
df['xycov'] = (df['x'] - x_mean) * (df['y'] - y_mean)
df['xvar'] = (df['x'] - x_mean)**2

# Calculate the slope and intercept
m = df['xycov'].sum() / df['xvar'].sum()
c = y_mean - (m * x_mean)
print('c = ', c)
print ('m = ', m)
# The slope m multiplies x; the intercept c is the constant term.
print ('line: ', 'y = '+str(round(m, 3))+'x + '+str(round(c,3)))

In [None]:
# Evaluate the fitted line at every observed x to get the predicted y
df['y_pred'] = c + m * df['x']

In [None]:
# Display the whole frame, including the helper columns (xycov, xvar) and y_pred added above
df

In [None]:
# Overlay the fitted line (red) on the observed points
sns.lineplot(data=df, x='x', y='y_pred', color='red')
sns.scatterplot(data=df, x='x', y='y')

In [None]:
# Side-by-side bars of the actual y and the predicted y for each x,
# as a quick visual check of the fit
df.plot.bar(x='x', y=['y', 'y_pred'], figsize=(20, 5))

In [None]:
# Coefficient of determination (R squared): the share of the variance in the
# dependent variable that the independent variable accounts for.
# It ranges from 0 to 1.
# Computed here as 1 - var(residuals) / var(y).

1 - (df['y'] - df['y_pred']).var() / df['y'].var()

Here the predictability is really good.

In [None]:
# Predicting y for a new point x = 11 is just evaluating the fitted line there
c + m * 11

Let's try out a real example.

In [None]:
# Colab-only: mount Google Drive at /content/drive so files stored there can be read.
from google.colab import drive
drive.mount('/content/drive')

In [None]:
# Placeholder: set this to the location of the CSV before running the next cell,
# which passes it to pd.read_csv.
# NOTE(review): the file is presumably somewhere under /content/drive — confirm.
path = ""

In [None]:
# Load the CSV at `path`; this rebinds `df`, replacing the synthetic frame
df = pd.read_csv(filepath_or_buffer=path)

In [None]:
# First five rows of the loaded data
df.head(n=5)

In [None]:
# Last five rows of the loaded data
df.tail(n=5)

Let us remove Achham and Udayapur for now. We will use the model we create to predict the poverty rates of Achham and Udayapur.

In [None]:
# Drop the first and last rows by position.
# .copy() makes the result an independent frame, so the column assignments in
# later cells do not raise SettingWithCopyWarning.
# NOTE(review): assumes Achham is the first row and Udayapur the last — confirm against the CSV.
# Caution: this cell is not idempotent; re-running it removes two more rows.
df = df.iloc[1:-1].copy()

In [None]:
# Check the last five rows after trimming
df.tail(n=5)

In [None]:
# Scatter of poverty rate against literacy rate
sns.scatterplot(data=df, x='literacy rate', y='poverty rate')

In [None]:
# Pearson correlation between literacy rate and poverty rate
# (Pearson is the default method of Series.corr)
df['literacy rate'].corr(df['poverty rate'])

In [None]:
# Fit the line y = m*x + c by least squares, with x = literacy rate and y = poverty rate.
x_mean = df['literacy rate'].mean()
y_mean = df['poverty rate'].mean()

# Per-row terms of the x/y covariance and of the x variance.
# The common 1/n factor is omitted because it cancels in the ratio below.
df['xycov'] = (df['literacy rate'] - x_mean) * (df['poverty rate'] - y_mean)
df['xvar'] = (df['literacy rate'] - x_mean)**2

# Calculate the slope and intercept
m = df['xycov'].sum() / df['xvar'].sum()
c = y_mean - (m * x_mean)
print('c = ', c)
print ('m = ', m)
# The slope m multiplies x; the intercept c is the constant term.
print ('line: ', 'y = '+str(round(m, 3))+'x + '+str(round(c,3)))

In [None]:
# Predicted poverty rate for every row, from the fitted line
df['y_pred'] = c + m * df['literacy rate']

In [None]:
# Overlay the fitted line (red) on the observed points
sns.lineplot(data=df, x='literacy rate', y='y_pred', color='red')
sns.scatterplot(data=df, x='literacy rate', y='poverty rate')

For Achham, the predicted poverty rate is

In [None]:
# Evaluate the fitted line at 0.476151 and round to three decimals.
# NOTE(review): 0.476151 is presumably Achham's literacy rate from the held-out row — confirm.
round(c + m * 0.476151, 3)

But, we know that the actual poverty rate was 0.472

For Udayapur, the predicted poverty rate is

In [None]:
# Evaluate the fitted line at 0.614868 and round to three decimals.
# NOTE(review): 0.614868 is presumably Udayapur's literacy rate from the held-out row — confirm.
round(c + m * 0.614868, 3)

But, we know that the actual poverty rate was 0.259

In [None]:
# Coefficient of determination (R squared) for the fit above,
# computed as 1 - var(residuals) / var(observed poverty rate)
1 - (df['poverty rate'] - df['y_pred']).var() / df['poverty rate'].var()

The accuracy or score of linear regression is tied to how strongly the two variables are correlated with each other. For simple linear regression, the coefficient of determination ($R^2$) is the square of the Pearson correlation coefficient.