## Different `LinearRegression` predictions with `scipy.sparse` and `numpy.ndarray` input

More info here: https://stackoverflow.com/questions/63536907/different-linearregression-predictions-with-scipy-sparse-and-numpy-ndarray-input

In [1]:
import os
import sys
import numpy as np
import pandas as pd
import matplotlib as mpl
import matplotlib.pyplot as plt
import seaborn as sns

# Render inline figures as PNG to reduce output size while working with VS Code
%config InlineBackend.figure_format = 'png'

# Show matplotlib figures inline in the notebook output
%matplotlib inline

# Display all DataFrame columns (None removes the column limit)
pd.options.display.max_columns = None

# Relative directories: FIGURES_PATH is where save_fig writes plots,
# DATASETS_PATH is where the input CSV is read from
FIGURES_PATH = "plots/"
DATASETS_PATH = "datasets/"

def save_fig(name, extension="png", resolution=300):
    """Write the current matplotlib figure to ``FIGURES_PATH``.

    The target directory is created if it does not exist yet, the layout is
    tightened, and the figure is saved as ``<name>.<extension>``.

    Parameters
    ----------
    name : str
        File name without the extension.
    extension : str, default "png"
        File extension; also handed to matplotlib as the output format.
    resolution : int, default 300
        Handed to matplotlib as the ``dpi`` of the saved figure.
    """
    os.makedirs(FIGURES_PATH, exist_ok=True)
    file_name = ".".join((name, extension))
    target = os.path.join(FIGURES_PATH, file_name)
    plt.tight_layout()
    plt.savefig(target, dpi=resolution, format=extension)

# Seed NumPy's global random generator; the train/test splits below
# additionally pass an explicit random_state
np.random.seed(42)

In [2]:
# Read audi.csv from DATASETS_PATH into a DataFrame
AUDI_DATASET_PATH = os.path.join(DATASETS_PATH, "audi.csv")
audi_orig = pd.read_csv(AUDI_DATASET_PATH)
# Bare last expression so the notebook renders the frame as the cell output
audi_orig

Unnamed: 0,model,year,price,transmission,mileage,fuelType,tax,mpg,engineSize
0,A1,2017,12500,Manual,15735,Petrol,150,55.4,1.4
1,A6,2016,16500,Automatic,36203,Diesel,20,64.2,2.0
2,A1,2016,11000,Manual,29946,Petrol,30,55.4,1.4
3,A4,2017,16800,Automatic,25952,Diesel,145,67.3,2.0
4,A3,2019,17300,Manual,1998,Petrol,145,49.6,1.0
...,...,...,...,...,...,...,...,...,...
10663,A3,2020,16999,Manual,4018,Petrol,145,49.6,1.0
10664,A3,2020,16999,Manual,1978,Petrol,150,49.6,1.0
10665,A3,2020,17199,Manual,609,Petrol,150,49.6,1.0
10666,Q3,2017,19499,Automatic,8646,Petrol,150,47.9,1.4


## scipy.sparse

In [9]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.metrics import r2_score

# Separate the target from the features, then hold out 20% of the rows for testing
X = audi_orig.drop(columns="price")
y = audi_orig[["price"]]  # double brackets keep y as a one-column DataFrame
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Numerical attributes: standardise every column
num_pipeline = Pipeline(steps=[("scaler", StandardScaler())])

# Categorical attributes: one-hot encode; categories not seen during fit
# are ignored at transform time instead of raising
cat_pipeline = Pipeline(
    steps=[("encoder", OneHotEncoder(handle_unknown="ignore"))]
)

# Complete preprocessing: send each group of columns through its own pipeline
full_pipeline = ColumnTransformer(
    transformers=[
        ("cat", cat_pipeline, ["model", "transmission", "fuelType"]),
        ("num", num_pipeline, ["year", "mileage", "tax", "mpg", "engineSize"]),
    ]
)

In [10]:
# Fit the preprocessing on the training split only, then apply it to the test split.
# The transformed matrices get their own names so that X_train / X_test keep holding
# the raw DataFrames: full_pipeline selects columns by name, so feeding it an already
# transformed matrix (as happens when this cell is re-run after rebinding X_train)
# would fail.
X_train_sparse = full_pipeline.fit_transform(X_train)
X_test_sparse = full_pipeline.transform(X_test)

# Ordinary least squares fitted directly on the transformer's sparse output
lin_reg = LinearRegression().fit(X_train_sparse, y_train)
pred = lin_reg.predict(X_test_sparse)

r2_score(y_test, pred) # 0.896044623680753 OK

0.896044623680753

## numpy.ndarray

In [11]:
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LinearRegression
from sklearn.metrics import r2_score

# Re-create the same split as in the sparse run (same test_size and random_state)
X = audi_orig.drop(columns="price")
y = audi_orig[["price"]]  # double brackets keep y as a one-column DataFrame
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

# Numerical attributes: standardise every column
num_pipeline = Pipeline(steps=[("scaler", StandardScaler())])

# Categorical attributes: one-hot encode; categories not seen during fit
# are ignored at transform time instead of raising
cat_pipeline = Pipeline(
    steps=[("encoder", OneHotEncoder(handle_unknown="ignore"))]
)

# Complete preprocessing: send each group of columns through its own pipeline
full_pipeline = ColumnTransformer(
    transformers=[
        ("cat", cat_pipeline, ["model", "transmission", "fuelType"]),
        ("num", num_pipeline, ["year", "mileage", "tax", "mpg", "engineSize"]),
    ]
)

In [12]:
# Same preprocessing as in the sparse run, but the transformer output is densified
# with .toarray() so LinearRegression is fitted on numpy.ndarray input.
# The dense matrices get their own names so that X_train / X_test keep holding the
# raw DataFrames: full_pipeline selects columns by name, so feeding it an already
# transformed array (as happens when this cell is re-run after rebinding X_train)
# would fail.
X_train_dense = full_pipeline.fit_transform(X_train).toarray()
X_test_dense = full_pipeline.transform(X_test).toarray()

lin_reg = LinearRegression().fit(X_train_dense, y_train)
pred = lin_reg.predict(X_test_dense)

# NOTE(review): same data and model as the sparse run, yet a wildly different score.
# Presumably the full one-hot encoding makes the design matrix rank-deficient and the
# dense and sparse inputs are solved differently -- confirm against the linked
# Stack Overflow question before relying on either result.
r2_score(y_test, pred) # -7.919935999010152e+19 Something is wrong

-7.919935999010152e+19

In [3]:
# Show the installed scikit-learn version, since the behaviour above may be version-specific
import sklearn
sklearn.__version__

'0.23.2'