In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [None]:
pwd

In [None]:
# Load the height/weight CSV into `df` and display it.
# NOTE(review): the location is an absolute, machine-specific path; this cell
# fails on any machine where this exact file does not exist.
csv_path = 'C:\\Users\\hp\\Downloads\\Linear_Regression/Height_Weight.csv'
df = pd.read_csv(csv_path)
df

In [None]:
# Preview the first five rows of df.
df.head(n=5)

In [None]:
# Print a summary of df: column names, non-null counts, dtypes and memory usage.
df.info()

In [None]:
# Summary statistics of df, using DataFrame.describe() with its default arguments.
df.describe()

In [None]:
# Summary statistics restricted to the object-dtype (categorical) columns.
df.describe(include=['object'])

In [None]:
# Remove the "id" column: it is only a unique identifier for each row.
# Reassigning (instead of inplace=True) together with errors='ignore' keeps this
# cell idempotent, so re-running it after "id" is already gone does not raise KeyError.
df = df.drop(columns=['id'], errors='ignore')
df

In [None]:
# Boolean mask of df: True wherever a value is missing.
df.isna()

In [None]:
# Number of missing values in each column.
df.isna().sum(axis=0)

Univariate and Bivariate Analysis

In [None]:
# Frequency table of the 'gender' column.
df.gender.value_counts()

In [None]:
# Bar chart of the number of rows per 'gender' value.
sns.countplot(data=df, x='gender')

In [None]:
# Scatter plot of 'weight' (x-axis) against 'height' (y-axis).
# x/y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so the former two-positional-argument call raises TypeError there.
sns.scatterplot(data=df, x='weight', y='height')

In [None]:
# Correlation coefficients between the numeric columns (weight and height).
# Non-numeric columns such as 'gender' are filtered out first, because
# DataFrame.corr() raises on string columns from pandas 2.0 onwards.
df.select_dtypes(include='number').corr()

In [None]:
# Histogram of the 'weight' column.
sns.displot(data=df['weight'], kind='hist')

In [None]:
# Histogram of the 'height' column.
sns.displot(data=df['height'], kind='hist')

In [None]:
# Skewness of the 'weight' distribution.
# Original author's reading of the result: moderately skewed.
df.weight.skew()

In [None]:
# Skewness of the 'height' distribution.
# Original author's reading of the result: almost symmetric.
df.height.skew()

In [None]:
# Box plot of 'height' to look for outliers.
# The series is passed explicitly as x: positional data arguments were deprecated
# in seaborn 0.11, and from 0.12 the first positional parameter is `data`,
# which draws the box vertically instead of horizontally.
sns.boxplot(x=df['height'])

In [None]:
# Box plot of 'weight' to look for outliers.
# The series is passed explicitly as x: positional data arguments were deprecated
# in seaborn 0.11, and from 0.12 the first positional parameter is `data`,
# which draws the box vertically instead of horizontally.
sns.boxplot(x=df['weight'])

In [None]:
# Independent variable X: the 'height' column of df.
X = df.loc[:, 'height']
X

In [None]:
# Dependent variable y: the 'weight' column of df.
y = df.loc[:, 'weight']
y

Plotting Best Fit Line Using np.polyfit

In [None]:
# Scatter of X (height) against y (weight) with a degree-1 least-squares line
# from np.polyfit drawn on top. numpy is already imported as np in the
# notebook's import cell, so it is not re-imported here.
plt.scatter(X, y)
m, b = np.polyfit(X, y, 1)  # m = slope, b = intercept
plt.plot(X, m * X + b, 'r')  # fitted line m*X + b, drawn in red
# Label the axes so the figure can be read on its own.
plt.xlabel('height')
plt.ylabel('weight');

Creation of dummy variables

In [None]:
# Names of the numeric columns of df, as a plain list.
list(df.select_dtypes(include='number').columns)

In [None]:
# Numeric-only frame holding the 'weight' and 'height' columns.
numeric_cols = ['weight', 'height']
df_num = df[numeric_cols]
df_num

In [None]:
# Names of the object-dtype (categorical) columns of df, as a plain list.
list(df.select_dtypes(include='object').columns)

In [None]:
# One-hot encode 'gender': one indicator column per distinct value.
# NOTE(review): no level is dropped, so for rows with a known gender the
# indicator columns always sum to 1 — keep this in mind when reading the
# fitted coefficients.
df_dummies = pd.get_dummies(df.gender)

In [None]:
# Last five rows of the gender indicator columns.
df_dummies.tail(n=5)

In [None]:
# Place the numeric columns and the gender indicators side by side (column-wise concat).
df_combined = pd.concat([df_num, df_dummies], axis='columns')

In [None]:
# First five rows of the combined dataset.
df_combined.head(n=5)

In [None]:
# Missing-value count for each column of the combined dataset.
df_combined.isna().sum()

In [None]:
# Feature matrix X: every column of df_combined except the target 'weight'.
X = df_combined.drop(columns=['weight'])

In [None]:
# Response vector y: the 'weight' column of df_combined.
y = df_combined.loc[:, 'weight']

In [None]:
# Display the feature matrix.
X

In [None]:
# Display the response vector.
y

Splitting the data in Training and Test set
Using sklearn, we put 70% of the data into the training set and the remaining 30% into the test set.
Setting random_state makes the split reproducible: the same training and test sets are produced every time the code is run.

In [None]:
# Split into 70% training rows and the remaining rows for testing;
# random_state pins the shuffle so the split is repeatable.
from sklearn.model_selection import train_test_split

X_train, X_test, y_train, y_test = train_test_split(
    X,
    y,
    train_size=0.7,
    random_state=12,
)
# Shapes in the order: X_train, y_train, X_test, y_test.
tuple(part.shape for part in (X_train, y_train, X_test, y_test))

Performing Linear Regression

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
# Create a LinearRegression estimator with default settings and bind it to `lm`.
lm = LinearRegression()
# Display the (not yet fitted) estimator.
lm

In [None]:
# Fit the regression on the training split.
lm.fit(X=X_train, y=y_train)

Model Evaluation

In [None]:
# Print the fitted intercept term of the model.
print(lm.intercept_)

In [None]:
# Print the fitted coefficients, one per feature column the model was trained on.
print(lm.coef_)

In [None]:
# Tabulate the fitted coefficients, labelled by feature name.
coeff_df = pd.DataFrame(
    data=lm.coef_,
    index=X_test.columns,
    columns=['Coefficient'],
)
coeff_df

In [None]:
# Scatter of the 'F' gender indicator (x-axis) against 'weight' (y-axis).
# x/y are passed by keyword: seaborn >= 0.12 accepts only `data` positionally,
# so the former two-positional-argument call raises TypeError there.
sns.scatterplot(data=df_combined, x='F', y='weight')

In [None]:
# Predict the response (weight) for the held-out test features.
y_pred = lm.predict(X_test)
# Show the predicted values.
print(y_pred)

In [None]:
# Test-set performance: mean squared error and coefficient of determination (R squared).
from sklearn.metrics import mean_squared_error, r2_score

mse = mean_squared_error(y_true=y_test, y_pred=y_pred)
r_squared = r2_score(y_true=y_test, y_pred=y_pred)
r_squared



In [None]:
# Report MSE, its square root (RMSE) and R squared for the test set.
from math import sqrt

rmse = sqrt(mse)
for label, value in (
    ('Mean_Squared_Error :', mse),
    ('Root_Mean_Squared_Error :', rmse),
    ('r_square_value :', r_squared),
):
    print(label, value)

In [None]:
# Side-by-side table of actual vs. predicted weights for the test rows.
comparison = {'Actual': y_test, 'Predicted': y_pred}
df1 = pd.DataFrame(comparison)
df1