### Sklearn Linear Regression Model

In [15]:
import numpy as np
import pandas as pd

import scipy.stats as stats

import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline

In [16]:

df = pd.read_csv(r"C:\Users\divyakamat\python_git_repositories\linear_regression\Milage Prediction\dataset\Car-mpg-processed.csv")

In [17]:
def intial_analysis(dataframe):
    
    # Enclosing the string within '\033[1m' and '\033[0m', displays the text in bold
    print ('\033[1m' + "Display first few rows of the data frame:"+'\033[0m')
    print(dataframe.head(3))
    
    print('\033[1m' + "\nDisplay the shape (columns and rows) of the dataset:" +'\033[0m' )
    print("\tcolumns : {}\n\trows : {}".format(dataframe.shape[0],dataframe.shape[1]))
    
    print('\033[1m' + "\nInformation about the dataset:" +'\033[0m')
    dataframe.info()
    
    print('\033[1m' + "\nDetails on Numerical and Categorical features within dataset:\n" + '\033[0m')
    #list the number of Numerical Features in our dataset.
    numerical_feature_columns = list(df._get_numeric_data().columns)
    print("Numeric Columns:",numerical_feature_columns)
    
    #let's find out the number of Categorical Features in our dataset.
    categorical_feature_columns = list(set(df.columns) - set(df._get_numeric_data().columns))
    print("Categorical Columns:",categorical_feature_columns)
    
    print('\033[1m' + "\nPrint any null values within dataset:\n" + '\033[0m')
    labels = []
    values = []
    for col in dataframe.columns:
        labels.append(col)
        values.append(dataframe[col].isnull().sum())
        if values[-1]!=0:
            print(col, values[-1])

intial_analysis(df)  

[1mDisplay first few rows of the data frame:[0m
    mpg  cylinders  displacement  horsepower  weight  acceleration  \
0  18.0          8         307.0       130.0    3504          12.0   
1  15.0          8         350.0       165.0    3693          11.5   
2  18.0          8         318.0       150.0    3436          11.0   

   model_year  car_type  origin_america  origin_asia  origin_europe  
0          70         0               1            0              0  
1          70         0               1            0              0  
2          70         0               1            0              0  
[1m
Display the shape (columns and rows) of the dataset:[0m
	columns : 398
	rows : 11
[1m
Information about the dataset:[0m
<class 'pandas.core.frame.DataFrame'>
RangeIndex: 398 entries, 0 to 397
Data columns (total 11 columns):
mpg               398 non-null float64
cylinders         398 non-null int64
displacement      398 non-null float64
horsepower        398 non-null float64
we

### Model Creation

In [18]:
df_train = df.copy()
# Copy all the predictor variables into X dataframe. Since 'mpg' is dependent variable drop it
X = df_train.drop('mpg', axis=1)

# Copy the 'mpg' column alone into the y dataframe. This is the dependent variable
y = df_train[['mpg']]


In [19]:
#Let us break the X and y dataframes into training set and test set. For this we will use
#Sklearn package's data splitting function which is based on random function

from sklearn.model_selection import train_test_split

# Split X and y into training and test set in 75:25 ratio

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=1)

In [20]:
# Import Linear Regression machine learning library
from sklearn.linear_model import LinearRegression
# invoke the LinearRegression function and find the bestfit model on training data

regression_model = LinearRegression()
regression_model.fit(X_train, y_train)


LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False)

In [21]:
# Let us explore the coefficients for each of the independent attributes

for idx, col_name in enumerate(X_train.columns):
    print("The coefficient for {} is {}".format(col_name, regression_model.coef_[0][idx]))
    
# Let us check the intercept for the model

intercept = regression_model.intercept_[0]

print("\nThe intercept for our model is {}".format(intercept))

The coefficient for cylinders is 1.4750732919168072
The coefficient for displacement is 0.024349322002431513
The coefficient for horsepower is -0.046861910393252465
The coefficient for weight is -0.0065641632962784545
The coefficient for acceleration is 0.041702279188102584
The coefficient for model_year is 0.7938975478842015
The coefficient for car_type is 5.966597439861032
The coefficient for origin_america is -1.4243481095472323
The coefficient for origin_asia is 0.6521119249598665
The coefficient for origin_europe is 0.7722361845873491

The intercept for our model is -28.235395568497534


#we can write our linear model as:
#Y=−28.23+1.47×X1+0.03×X2–0.02×X3–0.01×X4+0.12×X5+0.85×X6–1.90×X7+0.74×X8+1.16×X9

In [22]:
# Model score - R2 or coeff of determinant
# R^2=1–RSS / TSS

regression_model.score(X_test, y_test)

0.8513421387780064