<a href="https://colab.research.google.com/github/eckoecho/CodingDojo/blob/Model/Linear_Regression_in_Python.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Importing necessary libraries



In [37]:
# Import standard packages
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
# Import modeling tools
from sklearn import set_config
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline, make_pipeline
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler

# Have every transformer return pandas DataFrames instead of NumPy arrays
set_config(transform_output="pandas")

# Load Data

In [38]:
# Path to the raw Galton height CSV.
# NOTE(review): this is an absolute Google Drive path and no drive-mount call is
# visible in this notebook — confirm Drive is mounted before running.
fpath = "/content/drive/MyDrive/CodingDojo/02-MachineLearning/Week06/Data/galton-height-raw.csv"
df = pd.read_csv(filepath_or_buffer=fpath)

# Print the column summary, then preview the first five rows
df.info()
df.head(n=5)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 898 entries, 0 to 897
Data columns (total 6 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   family  898 non-null    object 
 1   father  898 non-null    float64
 2   mother  898 non-null    float64
 3   gender  898 non-null    object 
 4   height  898 non-null    float64
 5   kids    898 non-null    int64  
dtypes: float64(3), int64(1), object(2)
memory usage: 42.2+ KB


Unnamed: 0,family,father,mother,gender,height,kids
0,1,78.5,67.0,M,73.2,4
1,1,78.5,67.0,F,69.2,4
2,1,78.5,67.0,F,69.0,4
3,1,78.5,67.0,F,69.0,4
4,2,75.5,66.5,M,73.5,4


In [39]:
# Count fully duplicated rows (every column identical to an earlier row)
df.duplicated(keep="first").sum()

112

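Since it's such a small data set, we are choosing to leave in the duplicates. The rows have no unique child identifier, so identical rows may well be distinct siblings with the same gender and height rather than data-entry errors.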

In [40]:
# Count missing values in each column
df.isnull().sum()

family    0
father    0
mother    0
gender    0
height    0
kids      0
dtype: int64

In [41]:
# Count the unique categories in each object (string) column
df.select_dtypes(include="object").nunique()

family    197
gender      2
dtype: int64

There are two categorical features, but family has high cardinality; therefore, we will drop it.



In [42]:
# Drop the high-cardinality `family` identifier before modeling.
# errors="ignore" keeps this cell idempotent: `df` is reassigned here, so
# re-running the cell after `family` is already gone would otherwise raise KeyError.
df = df.drop(columns="family", errors="ignore")
df.head()

Unnamed: 0,father,mother,gender,height,kids
0,78.5,67.0,M,73.2,4
1,78.5,67.0,F,69.2,4
2,78.5,67.0,F,69.0,4
3,78.5,67.0,F,69.0,4
4,75.5,66.5,M,73.5,4


In [43]:
# Check the gender labels for inconsistent spellings or casing
df.loc[:, "gender"].value_counts()

M    465
F    433
Name: gender, dtype: int64

In [44]:
# Summary statistics (rounded to 2 decimals) to spot impossible values
df.describe().round(decimals=2)

Unnamed: 0,father,mother,height,kids
count,898.0,898.0,898.0,898.0
mean,69.23,64.08,66.76,6.14
std,2.47,2.31,3.58,2.69
min,62.0,58.0,56.0,1.0
25%,68.0,63.0,64.0,4.0
50%,69.0,64.0,66.5,6.0
75%,71.0,65.5,69.7,8.0
max,78.5,70.5,79.0,15.0


# Assign the Target (y) and Features (X)


In [45]:
# Target: the `height` column; features: every remaining column
y = df.loc[:, "height"]
X = df.drop(columns=["height"])

# Train Test Split

In [46]:
# Split features and target into train/test parts (default split size),
# with a fixed seed so the split is reproducible
split_parts = train_test_split(X, y, random_state=42)
X_train, X_test, y_train, y_test = split_parts
X_train.head()

Unnamed: 0,father,mother,gender,kids
377,70.5,62.0,F,8
357,70.5,63.0,F,5
723,67.0,64.0,M,4
306,70.0,64.7,F,7
464,69.0,66.0,F,9


In [47]:
# Confirm dtypes and non-null counts of the training features
X_train.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 673 entries, 377 to 102
Data columns (total 4 columns):
 #   Column  Non-Null Count  Dtype  
---  ------  --------------  -----  
 0   father  673 non-null    float64
 1   mother  673 non-null    float64
 2   gender  673 non-null    object 
 3   kids    673 non-null    int64  
dtypes: float64(2), int64(1), object(1)
memory usage: 26.3+ KB


# Create the Preprocessor ColumnTransformer


In [48]:
# Scaler applied to the numeric features
scaler = StandardScaler()
# Names of every numeric column in the training features
num_cols = X_train.select_dtypes(include='number').columns
# (name, transformer, columns) tuple consumed by the ColumnTransformer
num_tuple = ('numeric', scaler, num_cols)
num_tuple



('numeric',
 StandardScaler(),
 Index(['father', 'mother', 'kids'], dtype='object'))

In [49]:
# Dense one-hot encoder that ignores categories unseen during fit
encoder_ohe = OneHotEncoder(sparse_output=False, handle_unknown='ignore')
# Names of every object (string) column in the training features
cat_cols = X_train.select_dtypes(include='object').columns
# (name, transformer, columns) tuple consumed by the ColumnTransformer
cat_tuple = ('categorical', encoder_ohe, cat_cols)
cat_tuple


('categorical',
 OneHotEncoder(handle_unknown='ignore', sparse_output=False),
 Index(['gender'], dtype='object'))

In [50]:
# Combine the numeric and categorical steps into one preprocessor;
# verbose_feature_names_out=False keeps output column names free of the step-name prefix
preprocessor = ColumnTransformer(
    transformers=[num_tuple, cat_tuple],
    verbose_feature_names_out=False,
)
preprocessor


# Transform the Data with the Preprocessor


In [51]:
# Learn the preprocessing parameters from the training data only and transform it
X_train_tf = preprocessor.fit_transform(X_train)
# Apply the already-fitted preprocessor to the test data
X_test_tf = preprocessor.transform(X_test)
X_train_tf.head()

Unnamed: 0,father,mother,kids,gender_F,gender_M
377,0.513292,-0.880603,0.706629,1.0,0.0
357,0.513292,-0.458911,-0.412339,1.0,0.0
723,-0.882368,-0.037219,-0.785329,0.0,1.0
306,0.313912,0.257965,0.33364,1.0,0.0
464,-0.084848,0.806165,1.079619,1.0,0.0


# Import and instantiate the model

In [52]:
# LinearRegression is imported with the other modeling tools in the imports cell
# at the top of the notebook, so every dependency is declared in one place.
# Instantiate the linear regression model with scikit-learn's default settings.
lin_reg = LinearRegression()
lin_reg

# Train the model on your training data.

In [54]:
# Train the linear regression on the preprocessed training features and target
lin_reg.fit(X=X_train_tf, y=y_train)

# Use the model to make predictions for training and testing data

In [56]:
# Predict the target for the preprocessed training and testing features
y_predictions_train, y_predictions_test = (
    lin_reg.predict(features) for features in (X_train_tf, X_test_tf)
)

# Evaluate the Results

In [57]:
# Build a results table: the transformed test features plus the true height,
# the predicted height, and the prediction error (predicted - true)
prediction_df = X_test_tf.assign(
    **{
        "True Height": y_test,
        "Predicted Height": y_predictions_test.round(1),
        "Error": (y_predictions_test - y_test).round(1),
    }
)
prediction_df.head(10)

Unnamed: 0,father,mother,kids,gender_F,gender_M,True Height,Predicted Height,Error
331,0.513292,0.173627,-1.904297,1.0,0.0,60.0,64.9,4.9
638,-0.483608,-0.458911,0.706629,0.0,1.0,65.5,68.4,2.9
326,0.513292,-0.037219,0.706629,0.0,1.0,68.0,69.8,1.8
848,-1.679888,-0.037219,-0.03935,0.0,1.0,67.0,67.5,0.5
39,1.908952,-0.880603,0.706629,1.0,0.0,63.5,65.4,1.9
327,0.513292,-0.037219,0.706629,0.0,1.0,68.0,69.8,1.8
375,0.513292,-0.880603,0.706629,1.0,0.0,65.0,63.9,-1.1
334,0.313912,-0.037219,-0.785329,1.0,0.0,64.0,64.4,0.4
208,0.712672,-0.880603,-0.412339,0.0,1.0,70.0,69.5,-0.5
136,0.712672,0.806165,-0.785329,1.0,0.0,67.0,65.4,-1.6
