# Multiple Linear Regression

## Importing the libraries

In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

## Importing the dataset

In [None]:
# Load the training and test sets from the mounted Drive folder.
dataset = pd.read_csv('drive/MyDrive/IDM Challenge 2/Training.csv')
testing = pd.read_csv('drive/MyDrive/IDM Challenge 2/Testing.csv')

# Features are every column except the last; the last column is the target.
# .copy() makes X an independent frame, so the later in-place edits
# (dropping 'ID', filling nulls) do not act on a slice of `dataset` and do
# not raise pandas' SettingWithCopyWarning.
X = dataset.iloc[:, :-1].copy()
y = dataset.iloc[:, -1]

In [None]:
# 'ID' is not used as a model feature (it is re-read later only to label the
# submission rows), so remove it from both feature frames.
for features in (X, testing):
    del features['ID']

In [None]:
# Dimensions (rows, columns) of the training feature frame.
X.shape

(100000, 106)

In [None]:
# Dimensions of the training target series.
y.shape

(100000,)

In [None]:
# Dimensions (rows, columns) of the test feature frame.
testing.shape

(30000, 106)

## Removing Null Values

In [None]:
# Total number of missing cells across all columns of the test frame.
testing.isnull().sum().sum()

69118

In [None]:
# Labels of the training columns that contain at least one missing value.
null_cols = X.columns[X.isna().any(axis=0)]

In [None]:
# Mean-impute missing values, always using the TRAINING mean so that the
# test set is transformed with statistics learned from training data only.
for column in null_cols:
    train_mean = X[column].mean()
    X[column] = X[column].fillna(train_mean)
    # Fill the test column's own gaps. Assigning the filled training column
    # here would overwrite the real test values with index-aligned training
    # rows.
    testing[column] = testing[column].fillna(train_mean)

# Numeric columns that are complete in training can still have gaps in the
# test set; fill those from the training mean as well.
numeric_test = testing.select_dtypes(include='number')
for column in numeric_test.columns[numeric_test.isnull().any()]:
    if column in X.columns:
        testing[column] = testing[column].fillna(X[column].mean())

In [None]:
# Count the missing cells left in the training features after imputation.
X.isnull().sum().sum()

0

In [None]:
# Count the missing cells left in the test features after imputation.
testing.isnull().sum().sum()

0

## Encoding the categorical variables

In [None]:
# Number of distinct 'sub_area' values (a missing value, if any, counts as one).
X['sub_area'].nunique(dropna=False)

108

In [None]:
# All feature column labels; used below to separate numeric from
# categorical columns.
cols = X.columns

In [None]:
# Labels of the numeric (and boolean) columns. Uses the public select_dtypes
# API instead of the private DataFrame._get_numeric_data(), which may change
# without notice.
num_cols = X.select_dtypes(include=['number', 'bool']).columns

In [None]:
# Categorical columns are the ones that are not numeric. Walking `cols` in
# order (rather than taking a set difference) keeps the result deterministic:
# set iteration order for strings changes between Python sessions, which
# would reorder the encoded features and make seeded models irreproducible.
numeric_names = set(num_cols)
cat_cols = [col for col in cols if col not in numeric_names]

In [None]:
# Positional index of each categorical column within X, in cat_cols order.
cat_col_index = [X.columns.get_loc(name) for name in cat_cols]

In [None]:
# Positional index of each numeric column within X, in num_cols order.
numeric_col_index = [X.columns.get_loc(name) for name in num_cols]

In [None]:
# Show the positions of the categorical columns that will be one-hot encoded.
cat_col_index

[57, 43, 2, 21, 46, 16, 20, 23, 49, 14, 17, 19, 18, 1, 22]

In [None]:
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, LabelEncoder

# One-hot encode the categorical columns (selected by position) and pass the
# remaining columns through unchanged. sparse_threshold=0 forces a dense
# array as output.
encoder_spec = [('encoder', OneHotEncoder(), cat_col_index)]
ct_1 = ColumnTransformer(
    transformers=encoder_spec,
    remainder='passthrough',
    sparse_threshold=0,
)

# Learn the categories from the training features only, then apply the same
# fitted encoding to the test features.
X = ct_1.fit_transform(X)
testing = ct_1.transform(testing)

In [None]:
# Dimensions of the training matrix after one-hot encoding.
X.shape

(100000, 230)

In [None]:
# Dimensions of the test matrix after one-hot encoding; the column count
# should match the training matrix.
testing.shape

(30000, 230)

## Training the Multiple Linear Regression model on the Training set

In [None]:
from sklearn.linear_model import LinearRegression
# Fit an ordinary least-squares model on the encoded training data.
# NOTE: `regressor` is rebound by the random-forest cell further down, so
# this fitted model is not the one used for the final predictions.
regressor = LinearRegression()
regressor.fit(X, y)

LinearRegression()

## Training the Random Forest Regression model on the Training set

In [None]:
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import SequentialFeatureSelector as sfs

# Random forest of 15 trees with a fixed seed; verbose=3 prints progress as
# each tree is built. This rebinds `regressor`, replacing any earlier model.
regressor = RandomForestRegressor(
    n_estimators=15,
    random_state=0,
    verbose=3,
)
regressor.fit(X, y)

[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.


building tree 1 of 15


[Parallel(n_jobs=1)]: Done   1 out of   1 | elapsed:    8.9s remaining:    0.0s


building tree 2 of 15


[Parallel(n_jobs=1)]: Done   2 out of   2 | elapsed:   17.6s remaining:    0.0s


building tree 3 of 15
building tree 4 of 15
building tree 5 of 15
building tree 6 of 15
building tree 7 of 15
building tree 8 of 15
building tree 9 of 15
building tree 10 of 15
building tree 11 of 15
building tree 12 of 15
building tree 13 of 15
building tree 14 of 15
building tree 15 of 15


[Parallel(n_jobs=1)]: Done  15 out of  15 | elapsed:  2.3min finished


RandomForestRegressor(n_estimators=15, random_state=0, verbose=3)

## Making a CSV file of the results

In [None]:
# Predict the target for every row of the encoded test matrix using the
# model currently bound to `regressor`.
final_pred = regressor.predict(testing)

In [None]:
# 'ID' was deleted from `testing` earlier, so re-read the test file to
# recover the identifiers and pair them with the predictions row by row.
temp = pd.read_csv('drive/MyDrive/IDM Challenge 2/Testing.csv')
frame = dict(ID=temp['ID'], price_doc=final_pred)
result = pd.DataFrame(data=frame)

In [None]:
# Write the ID / price_doc pairs to the submission file, without the
# DataFrame index column.
result.to_csv('drive/MyDrive/IDM Challenge 2/entry16.csv', index=False)