<a href="https://colab.research.google.com/github/franciscosalido/Multiple_Regression/blob/master/Multiple_Regression.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

# Multiple Regression

## Libraries

In [0]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
sns.set()
import statsmodels as sm
from statsmodels.stats.outliers_influence import variance_inflation_factor
from sklearn.model_selection import train_test_split, GridSearchCV, RandomizedSearchCV
from sklearn.linear_model import LinearRegression, Ridge, Lasso
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.metrics import r2_score, mean_squared_error
from sklearn import preprocessing


## Data Preprocessing

Loading Data File

In [0]:
file1 = 'https://raw.githubusercontent.com/prince381/car_mpg_predict/master/cars1.csv'
file2 = 'https://raw.githubusercontent.com/prince381/car_mpg_predict/master/cars2.csv'

Reading Data File

In [0]:
cars1 = pd.read_csv(file1)
cars2 = pd.read_csv(file2)

Cleaning the Data

In [0]:
cars1.drop(cars1.columns[9:], axis=1, inplace=True)

Concatenate the two variables

In [0]:
cars = pd.concat([cars1, cars2])

Print the head of the new data variable

In [18]:
cars.head()

Unnamed: 0,mpg,cylinders,displacement,horsepower,weight,acceleration,model,origin,car
0,18.0,8,307,130,3504,12.0,70,1,chevrolet chevelle malibu
1,15.0,8,350,165,3693,11.5,70,1,buick skylark 320
2,18.0,8,318,150,3436,11.0,70,1,plymouth satellite
3,16.0,8,304,150,3433,12.0,70,1,amc rebel sst
4,17.0,8,302,140,3449,10.5,70,1,ford torino


Print the info of the new data variable

In [19]:
cars.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 398 entries, 0 to 199
Data columns (total 9 columns):
mpg             398 non-null float64
cylinders       398 non-null int64
displacement    398 non-null int64
horsepower      398 non-null object
weight          398 non-null int64
acceleration    398 non-null float64
model           398 non-null int64
origin          398 non-null int64
car             398 non-null object
dtypes: float64(2), int64(5), object(2)
memory usage: 31.1+ KB


## Treating the Data

Print the Horsepower column unique values

In [20]:
cars.horsepower.unique()

array(['130', '165', '150', '140', '198', '220', '215', '225', '190',
       '170', '160', '95', '97', '85', '88', '46', '87', '90', '113',
       '200', '210', '193', '?', '100', '105', '175', '153', '180', '110',
       '72', '86', '70', '76', '65', '69', '60', '80', '54', '208', '155',
       '112', '92', '145', '137', '158', '167', '94', '107', '230', '49',
       '75', '91', '122', '67', '83', '78', '52', '61', '93', '148',
       '129', '96', '71', '98', '115', '53', '81', '79', '120', '152',
       '102', '108', '68', '58', '149', '89', '63', '48', '66', '139',
       '103', '125', '133', '138', '135', '142', '77', '62', '132', '84',
       '64', '74', '116', '82'], dtype=object)

Replace '?' with 'NaN' values 

In [0]:
cars.horsepower = cars.horsepower.str.replace('?', 'NaN').astype(float)

Fill the missing data with mean value of horsepower

In [0]:
cars.horsepower.fillna(cars.horsepower.mean(), inplace=True)

Change the data type to integer

In [0]:
cars.horsepower = cars.horsepower.astype(int)

Print the info of the data again

In [25]:
cars.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 398 entries, 0 to 199
Data columns (total 9 columns):
mpg             398 non-null float64
cylinders       398 non-null int64
displacement    398 non-null int64
horsepower      398 non-null int64
weight          398 non-null int64
acceleration    398 non-null float64
model           398 non-null int64
origin          398 non-null int64
car             398 non-null object
dtypes: float64(2), int64(6), object(1)
memory usage: 31.1+ KB


# Cleaning the Data