# Regresión Lineal Múltiple


# Importar las librerías


In [115]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

# Importar el data set


Este conjunto de datos tiene datos recopilados de Nueva York, California y Florida sobre 50 startups, 17 en cada estado. Las variables utilizadas en el conjunto de datos son  el beneficio, Gasto en I+D, Gasto en administración y Gasto en marketing. La variable objetivo a predecir en este caso va a ser el beneficio ("Profit").

In [None]:
dataset = pd.read_csv('../data/50_Startups.csv')

In [119]:
dataset.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,State,Profit
0,165349.2,136897.8,471784.1,New York,192261.83
1,162597.7,151377.59,443898.53,California,191792.06
2,153441.51,101145.55,407934.54,Florida,191050.39
3,144372.41,118671.85,383199.62,New York,182901.99
4,142107.34,91391.77,366168.42,Florida,166187.94


# Codificar datos categóricos

In [120]:
dataset_dummies = pd.get_dummies(dataset)

In [121]:
dataset_dummies.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,State_California,State_Florida,State_New York
0,165349.2,136897.8,471784.1,192261.83,0,0,1
1,162597.7,151377.59,443898.53,191792.06,1,0,0
2,153441.51,101145.55,407934.54,191050.39,0,1,0
3,144372.41,118671.85,383199.62,182901.99,0,0,1
4,142107.34,91391.77,366168.42,166187.94,0,1,0


# Podemos quedarnos con dos de las variables onehot, no necesitamos las tres


In [122]:
dataset = pd.get_dummies(dataset, drop_first = True)

In [123]:
dataset.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,State_Florida,State_New York
0,165349.2,136897.8,471784.1,192261.83,0,1
1,162597.7,151377.59,443898.53,191792.06,0,0
2,153441.51,101145.55,407934.54,191050.39,1,0
3,144372.41,118671.85,383199.62,182901.99,0,1
4,142107.34,91391.77,366168.42,166187.94,1,0


# Dividir el data set en conjunto de entrenamiento y conjunto de testing


In [124]:
# Variables que utilizaremos para predecir la variable objetivo
X = dataset.iloc[:, [0,1,2,4,5]].values
# Variable objetivo
y = dataset['Profit']

In [125]:
from sklearn.model_selection import train_test_split

In [126]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

# Escalado de variables

In [127]:
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)


# Ajustar el modelo de Regresión lineal múltiple con el conjunto de entrenamiento


In [128]:
from sklearn.linear_model import LinearRegression

In [129]:
regression = LinearRegression()
regression.fit(X_train, y_train)

In [130]:
regression.coef_

array([38584.38113487,   777.8870992 ,  4047.71944397,   185.21159662,
         147.27510027])

In [131]:
regression.intercept_

110225.32057142857

# Predicción de los resultados en el conjunto de testing


In [132]:
y_pred = regression.predict(X_test)

## Calculamos el R2 score o coeficiente de determinación

In [133]:
# Sacamos el coeficiente de determinación
from sklearn.metrics import r2_score

r2 = r2_score(y_pred = y_pred, y_true = y_test)
r2

0.9358680970046519

In [134]:
# Calculamos el coeficiente de R cuadrado ajustado
def r2_adjusted(y_pred, X_test, r2):    
    N=y_pred.shape[0]
    p= X_test.shape[1]
    x = (1-r2)
    y = (N-1) / (N-p-1)
    adj_rsquared = (1 - (x * y))
    return adj_rsquared

In [135]:
print("Adjusted-R2 : " , r2_adjusted(y_pred, X_test, r2))

Adjusted-R2 :  0.9002392620072363


## Intenta mejorar los resultados seleccionando solo algunas de las variables. Por ejemplo, emplea el método de selección hacia atrás visto en la clase anterior.

Probamos quitando una variable:

In [136]:
dataset = pd.read_csv('50_Startups.csv')
# Variables que utilizaremos para predecir la variable objetivo
X = dataset.iloc[:, :-2].values
# Variable objetivo
y = dataset.iloc[:, 4].values

# Dividir el data set en conjunto de entrenamiento y conjunto de testing


In [141]:
from sklearn.model_selection import train_test_split

In [142]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

# Escalado de variables

In [143]:
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)


# Ajustar el modelo de Regresión lineal múltiple con el conjunto de entrenamiento


In [144]:
from sklearn.linear_model import LinearRegression

In [145]:
regression = LinearRegression()
regression.fit(X_train, y_train)

# Predicción de los resultados en el conjunto de testing


In [146]:
y_pred = regression.predict(X_test)

## Calculamos el R2 score o coeficiente de determinación

In [147]:
r2 = regression.score(X_test, y_test)
r2

0.9355188337118217

In [148]:
print("Adjusted-R2 : " , r2_adjusted(y_pred, X_test, r2))

Adjusted-R2 :  0.917933061087773


Probamos quitando dos variables:

In [149]:
dataset = pd.read_csv('50_Startups.csv')
# Variables que utilizaremos para predecir la variable objetivo
X = dataset.iloc[:, :-3].values
# Variable objetivo
y = dataset.iloc[:, 4].values

# Dividir el data set en conjunto de entrenamiento y conjunto de testing


In [150]:
from sklearn.model_selection import train_test_split

In [151]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

# Escalado de variables

In [152]:
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)


# Ajustar el modelo de Regresión lineal múltiple con el conjunto de entrenamiento


In [153]:
from sklearn.linear_model import LinearRegression

In [154]:
regression = LinearRegression()
regression.fit(X_train, y_train)

# Predicción de los resultados en el conjunto de testing


In [155]:
y_pred = regression.predict(X_test)

## Calculamos el R2 score o coeficiente de determinación

In [156]:
r2 = regression.score(X_test, y_test)
r2

0.9469407189577184

In [157]:
print("Adjusted-R2 : " , r2_adjusted(y_pred, X_test, r2))

Adjusted-R2 :  0.9317809243742093


## Selección de variables hacia atrás

In [158]:
dataset = pd.read_csv('50_Startups.csv')

In [159]:
dataset = pd.get_dummies(dataset, drop_first = True)

In [167]:
# Variables que utilizaremos para predecir la variable objetivo
X = dataset.iloc[:, [0,1,2,4,5]]
# Variable objetivo
y = dataset['Profit']

In [168]:
from sklearn.model_selection import train_test_split

In [169]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [170]:
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

In [173]:
from mlxtend.feature_selection import SequentialFeatureSelector as SFS
sbs = SFS(LinearRegression(),k_features=2,forward=False)
sbs.fit(np.array(X_train), y_train)

In [None]:
!pip install mlxtend

In [172]:
sbs.k_feature_names_

('0', '2')

In [174]:
[X.columns[int(index)] for index in list(sbs.k_feature_idx_)]

['R&D Spend', 'Marketing Spend']

In [175]:
X = dataset[[X.columns[int(index)] for index in list(sbs.k_feature_idx_)]]
y = dataset['Profit']

In [176]:
from sklearn.model_selection import train_test_split

In [177]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.3, random_state = 0)

In [178]:
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

In [179]:
regression = LinearRegression()
regression.fit(X_train, y_train)

In [180]:
y_pred = regression.predict(X_test)

In [181]:
r2 = regression.score(X_test, y_test)
r2

0.9431305015271918

In [182]:
print("Adjusted-R2 : " , r2_adjusted(y_pred, X_test, r2))

Adjusted-R2 :  0.9336522517817237


## Selección de variables automatizada

In [183]:
dataset = pd.read_csv('50_Startups.csv')

# Codificar datos categóricos

In [184]:
dataset_dummies = pd.get_dummies(dataset)

In [185]:
dataset_dummies.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,State_California,State_Florida,State_New York
0,165349.2,136897.8,471784.1,192261.83,0,0,1
1,162597.7,151377.59,443898.53,191792.06,1,0,0
2,153441.51,101145.55,407934.54,191050.39,0,1,0
3,144372.41,118671.85,383199.62,182901.99,0,0,1
4,142107.34,91391.77,366168.42,166187.94,0,1,0


# Podemos quedarnos con dos de las variables onehot, no necesitamos las tres


In [186]:
dataset = pd.get_dummies(dataset, drop_first = True)

In [187]:
dataset.head()

Unnamed: 0,R&D Spend,Administration,Marketing Spend,Profit,State_Florida,State_New York
0,165349.2,136897.8,471784.1,192261.83,0,1
1,162597.7,151377.59,443898.53,191792.06,0,0
2,153441.51,101145.55,407934.54,191050.39,1,0
3,144372.41,118671.85,383199.62,182901.99,0,1
4,142107.34,91391.77,366168.42,166187.94,1,0


# Dividir el data set en conjunto de entrenamiento y conjunto de testing


In [188]:
# Variables que utilizaremos para predecir la variable objetivo
X = dataset.iloc[:, [0,1,2,4,5]].values
# Variable objetivo
y = dataset['Profit']

In [189]:
from sklearn.model_selection import train_test_split

In [190]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

# Escalado de variables

In [191]:
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)


In [192]:
X_train_df = pd.DataFrame(X_train, columns = dataset.drop('Profit', axis = 1).columns)
X_test_df = pd.DataFrame(X_test, columns = dataset.drop('Profit', axis = 1).columns)

In [193]:
from sklearn.metrics import r2_score
dataset_seleccion = pd.DataFrame()
for i in range(1, len(X_train_df.columns)):
    sbs = SFS(LinearRegression(),k_features=i,forward=False,floating=False)
    sbs.fit(np.array(X_train), y_train)
#    sbs.k_feature_names_
    regression = LinearRegression()
    regression.fit(X_train_df[[X_train_df.columns[int(index)] for index in list(sbs.k_feature_idx_)]], 
                   y_train)
    y_pred = regression.predict(X_test_df[[X_train_df.columns[int(index)] for index in list(sbs.k_feature_idx_)]])
    r2 = r2_score(y_test, y_pred)
    dataset_seleccion = pd.concat([dataset_seleccion,pd.DataFrame({
        "variables":[[X_train_df.columns[int(index)] for index in list(sbs.k_feature_idx_)]], 
        "r2": r2})],ignore_index = True)


In [194]:
dataset_seleccion.sort_values(by = "r2", ascending = False, ignore_index = True)

Unnamed: 0,variables,r2
0,"[R&D Spend, Marketing Spend]",0.947439
1,[R&D Spend],0.946459
2,"[R&D Spend, Administration, Marketing Spend]",0.939396
3,"[R&D Spend, Administration, Marketing Spend, S...",0.936703


In [66]:
X = dataset[dataset_seleccion.sort_values(by = "r2", ascending = False, ignore_index = True)["variables"][0]]
y = dataset['Profit']

In [67]:
from sklearn.model_selection import train_test_split

In [68]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)

In [69]:
from sklearn.preprocessing import StandardScaler
sc_X = StandardScaler()
X_train = sc_X.fit_transform(X_train)
X_test = sc_X.transform(X_test)

In [70]:
regression = LinearRegression()
regression.fit(X_train, y_train)

In [71]:
y_pred = regression.predict(X_test)

In [72]:
r2 = regression.score(X_test, y_test)
r2

0.9474386447268488

In [73]:
print("Adjusted-R2 : " , r2_adjusted(y_pred, X_test, r2))

Adjusted-R2 :  0.9324211146488056
