# Basit Doğrusal Regresyon

## Modelleme 

In [None]:
from warnings import filterwarnings
# Silence every warning for the rest of the session (this also hides deprecation notices).
filterwarnings('ignore')

In [None]:
import pandas as pd
# Read only the CSV columns at positions 1-4; the column at position 0 is skipped.
ad = pd.read_csv("Advertising.csv", usecols = [1,2,3,4])
# Work on a copy so the frame as loaded stays available in `ad`.
df = ad.copy()
df.head()

In [None]:
# Disabled alternative to `usecols` above (positional column slice), kept for reference:
#df = df.iloc[:,1:len(df)]

In [None]:
df.head()

In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics, transposed so each variable is a row.
df.describe().T

In [None]:
# True if any cell in the frame is missing.
df.isnull().values.any()

In [None]:
# Pairwise correlation matrix of the columns.
df.corr()

In [None]:
import seaborn as sns
# Pairwise scatter plots with a fitted regression line; the trailing ';' hides the repr.
sns.pairplot(df, kind  ="reg");

In [None]:
# Joint distribution of TV and sales with a regression fit.
sns.jointplot(x = "TV", y = "sales", data = df, kind = "reg")

## Statsmodels ile modelleme

In [None]:
import statsmodels.api as sm

In [None]:
# Single predictor, kept as a one-column DataFrame (double brackets).
X = df[["TV"]]
X[0:5]

In [None]:
# sm.OLS does not add an intercept by itself, so prepend a constant column.
X = sm.add_constant(X)

In [None]:
X[0:5]

In [None]:
# Response variable.
y = df["sales"]

In [None]:
y[0:5]

In [None]:
# Build the OLS specification (response first, design matrix second).
lm = sm.OLS(y,X)

In [None]:
# Estimate the coefficients.
model = lm.fit()

In [None]:
model.summary()

In [None]:
import statsmodels.formula.api as smf
# Same model through the formula interface; `lm` and `model` are rebound here.
lm = smf.ols("sales ~ TV", df)
model = lm.fit()
model.summary()

In [None]:
model.params

In [None]:
model.summary().tables[1]

In [None]:
model.conf_int()

In [None]:
model.f_pvalue

In [None]:
print("f_pvalue: ", "%.4f" % model.f_pvalue)

In [None]:
print("fvalue: ", "%.2f" % model.fvalue)

In [None]:
print("tvalue: ", "%.2f" % model.tvalues[0:1])

In [None]:
model.rsquared_adj

In [None]:
model.fittedvalues[0:5]

In [None]:
y[0:5]

In [None]:
print("Sales = " +  str("%.2f" % model.params[0]) + " + TV" + "*" + str("%.2f" % model.params[1]))

In [None]:
import matplotlib.pyplot as plt

# Read the coefficients from the fitted formula model so the title always
# matches the line that is drawn (positional access via .iloc).
intercept_hat = model.params.iloc[0]
slope_hat = model.params.iloc[1]

# x and y are passed by keyword: recent seaborn releases reject positional data arguments.
g = sns.regplot(x=df["TV"], y=df["sales"], ci=None, scatter_kws={'color':'r', 's':9})
g.set_title("Model Denklemi: Sales = %.2f + TV*%.2f" % (intercept_hat, slope_hat))
g.set_ylabel("Satış Sayısı")
g.set_xlabel("TV Harcamaları")
plt.xlim(-10,310)
plt.ylim(bottom=0);

In [None]:
from sklearn.linear_model import LinearRegression

In [None]:
X = df[["TV"]]
y = df["sales"]
reg = LinearRegression()
model = reg.fit(X, y)
model.intercept_
model.coef_

In [None]:
model.score(X,y)

In [None]:
model.predict(X)[0:10]

## Tahmin

Model denklemi:

Sales = 7.03 + TV*0.05 (katsayılar iki ondalık basamağa yuvarlanmıştır)

Örneğin 30 birim TV harcaması olduğunda satışların tahmini değeri ne olur?

In [None]:
7.03 + 30*0.04

In [None]:
X = df[["TV"]]
y = df["sales"]
reg = LinearRegression()
model = reg.fit(X, y)

In [None]:
model.predict([[30]])

In [None]:
yeni_veri = [[5],[90],[200]]

In [None]:
model.predict(yeni_veri)

## Artıklar ve Makine Öğrenmesindeki Önemi

In [None]:
from sklearn.metrics import mean_squared_error, r2_score

In [None]:
# Refit the statsmodels formula model; `model` is rebound to its results object.
lm = smf.ols("sales ~ TV", df)
model = lm.fit()

In [None]:
# Mean squared error between observed sales and the in-sample fitted values.
mse = mean_squared_error(y, model.fittedvalues)

In [None]:
mse

In [None]:
import numpy as np
# Root mean squared error.
rmse = np.sqrt(mse)

In [None]:
rmse

In [None]:
# First ten predictions of the scikit-learn estimator `reg` fitted earlier.
reg.predict(X)[0:10]

In [None]:
y[0:10]

In [None]:
# Side-by-side table of the first ten observed and predicted values.
k_t = pd.DataFrame({"gercek_y": y[0:10],
                   "tahmin_y": reg.predict(X)[0:10]})

In [None]:
k_t

In [None]:
# Residual = observed - predicted.
k_t["hata"] = k_t["gercek_y"] - k_t["tahmin_y"]

In [None]:
k_t

In [None]:
# Squared residuals.
k_t["hata_kare"] = k_t["hata"]**2

In [None]:
k_t

In [None]:
# Sum of squared errors over these ten rows.
np.sum(k_t["hata_kare"])

In [None]:
# MSE over these ten rows.
np.mean(k_t["hata_kare"])

In [None]:
# RMSE over these ten rows.
np.sqrt(np.mean(k_t["hata_kare"]))

In [None]:
# Residuals as stored on the statsmodels results object.
model.resid[0:10]

In [None]:
plt.plot(model.resid)

# Çoklu Doğrusal Regresyon


In [None]:
import pandas as pd
# Reload the advertising data (CSV columns at positions 1-4) and work on a copy.
ad = pd.read_csv("Advertising.csv", usecols = [1,2,3,4])
df = ad.copy()
df.head()

In [10]:
from sklearn.model_selection import train_test_split, cross_val_score, cross_val_predict

In [None]:
# Predictors: every column except the response.
X = df.drop("sales", axis = 1)
y = df["sales"]

# Hold out 20% of the rows; the fixed seed makes the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.20, random_state= 42)


In [None]:
X_train.shape

In [None]:
y_train.shape

In [None]:
X_test.shape

In [None]:
y_test.shape

In [None]:
# Copy of the full frame (all rows, not only the training split).
training = df.copy()

In [None]:
training.shape

## Statsmodels

In [None]:
lm = sm.OLS(y_train, X_train)

In [None]:
model = lm.fit()
model.summary()

In [None]:
model.summary().tables[1]

## scikit-learn model

In [None]:
# Fit a linear regression on the training split; `lm` and `model` are rebound here.
lm = LinearRegression()
model = lm.fit(X_train, y_train)

In [None]:
# Estimated intercept.
model.intercept_

In [None]:
# Estimated slopes, in the column order of X_train.
model.coef_

## Tahmin

Model denklemi:

`Sales = 2.97 + TV*0.04 + radio*0.18 + newspaper*0.002`

Örneğin 30 birim TV harcaması, 10 birim radyo harcaması ve 40 birim gazete harcaması olduğunda satışların tahmini değeri ne olur?



In [None]:
yeni_veri = [[30], [10],[40]]
yeni_veri = pd.DataFrame(yeni_veri).T

In [None]:
model.predict(yeni_veri)

In [None]:
rmse = np.sqrt(mean_squared_error(y_train, model.predict(X_train)))

In [None]:
rmse

In [None]:
rmse = np.sqrt(mean_squared_error(y_test, model.predict(X_test)))

In [None]:
rmse

## Model Tuning / Model Doğrulama

In [None]:
df.head()

In [None]:
# Re-split with a different seed (144) and refit the linear model on the new training rows.
X = df.drop('sales', axis=1)
y = df["sales"]
X_train, X_test, y_train, y_test = train_test_split(X, y, 
                                                    test_size=0.20, 
                                                    random_state=144)
lm = LinearRegression() 
model = lm.fit(X_train, y_train)

In [None]:
# Training RMSE.
np.sqrt(mean_squared_error(y_train, model.predict(X_train)))

In [None]:
# Test RMSE for this particular split.
np.sqrt(mean_squared_error(y_test, model.predict(X_test)))

In [None]:
# R^2 on the training rows.
model.score(X_train, y_train)

In [None]:
# Mean R^2 over 10 cross-validation folds of the training rows.
cross_val_score(model, X_train, y_train, cv = 10, scoring = "r2").mean()

In [None]:
# Per-fold RMSE (sign flipped because the scorer returns negative MSE), averaged over folds.
np.sqrt(-cross_val_score(model, 
                X_train, 
                y_train, 
                cv = 10, 
                scoring = "neg_mean_squared_error")).mean()

In [None]:
# NOTE(review): cross_val_score refits clones of the estimator on folds of the
# data it is given, so this is a 10-fold CV estimate computed inside the test
# split, not the error of the model fitted above on X_train. Confirm this is intended.
np.sqrt(-cross_val_score(model, 
                X_test, 
                y_test, 
                cv = 10, 
                scoring = "neg_mean_squared_error")).mean()

# PCR Model

In [8]:
import pandas as pd
import numpy as np
# Load the Hitters data, work on a copy, and drop every row that has a missing value.
hit = pd.read_csv("Hitters.csv")
df = hit.copy()
df = df.dropna()
df.head()

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,League,Division,PutOuts,Assists,Errors,Salary,NewLeague
1,315,81,7,24,38,39,14,3449,835,69,321,414,375,N,W,632,43,10,475.0,N
2,479,130,18,66,72,76,3,1624,457,63,224,266,263,A,W,880,82,14,480.0,A
3,496,141,20,65,78,37,11,5628,1575,225,828,838,354,N,E,200,11,3,500.0,N
4,321,87,10,39,42,30,2,396,101,12,48,46,33,N,E,805,40,4,91.5,N
5,594,169,4,74,51,35,11,4408,1133,19,501,336,194,A,W,282,421,25,750.0,A


In [None]:
# Column dtypes and non-null counts.
df.info()

In [None]:
# Summary statistics, one row per variable.
df.describe().T

In [2]:
# One indicator column per level of the three categorical variables.
dms = pd.get_dummies(df[['League', 'Division', 'NewLeague']])
dms.head()

Unnamed: 0,League_A,League_N,Division_E,Division_W,NewLeague_A,NewLeague_N
1,0,1,0,1,0,1
2,1,0,0,1,1,0
3,0,1,1,0,0,1
4,0,1,1,0,0,1
5,1,0,0,1,1,0


In [3]:
# Response variable.
y = df["Salary"]

In [4]:
# Numeric predictors: drop the response and the three categorical columns, cast to float64.
X_ = df.drop(["Salary","League","Division","NewLeague"], axis = 1).astype("float64")

In [5]:
X_.head()

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,PutOuts,Assists,Errors
1,315.0,81.0,7.0,24.0,38.0,39.0,14.0,3449.0,835.0,69.0,321.0,414.0,375.0,632.0,43.0,10.0
2,479.0,130.0,18.0,66.0,72.0,76.0,3.0,1624.0,457.0,63.0,224.0,266.0,263.0,880.0,82.0,14.0
3,496.0,141.0,20.0,65.0,78.0,37.0,11.0,5628.0,1575.0,225.0,828.0,838.0,354.0,200.0,11.0,3.0
4,321.0,87.0,10.0,39.0,42.0,30.0,2.0,396.0,101.0,12.0,48.0,46.0,33.0,805.0,40.0,4.0
5,594.0,169.0,4.0,74.0,51.0,35.0,11.0,4408.0,1133.0,19.0,501.0,336.0,194.0,282.0,421.0,25.0


In [6]:
# Append one indicator per categorical variable (the other level is the baseline).
X = pd.concat([X_, dms[["League_N", "Division_W","NewLeague_N"]]], axis = 1)
X.head()

Unnamed: 0,AtBat,Hits,HmRun,Runs,RBI,Walks,Years,CAtBat,CHits,CHmRun,CRuns,CRBI,CWalks,PutOuts,Assists,Errors,League_N,Division_W,NewLeague_N
1,315.0,81.0,7.0,24.0,38.0,39.0,14.0,3449.0,835.0,69.0,321.0,414.0,375.0,632.0,43.0,10.0,1,1,1
2,479.0,130.0,18.0,66.0,72.0,76.0,3.0,1624.0,457.0,63.0,224.0,266.0,263.0,880.0,82.0,14.0,0,1,0
3,496.0,141.0,20.0,65.0,78.0,37.0,11.0,5628.0,1575.0,225.0,828.0,838.0,354.0,200.0,11.0,3.0,1,0,1
4,321.0,87.0,10.0,39.0,42.0,30.0,2.0,396.0,101.0,12.0,48.0,46.0,33.0,805.0,40.0,4.0,1,0,1
5,594.0,169.0,4.0,74.0,51.0,35.0,11.0,4408.0,1133.0,19.0,501.0,336.0,194.0,282.0,421.0,25.0,0,1,0


In [11]:
# Hold out 25% of the rows with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Copy of the full cleaned frame (all rows).
training = df.copy()

# Report the shape of every piece, one line each.
for label, part in (("X_train", X_train),
                    ("y_train", y_train),
                    ("X_test", X_test),
                    ("y_test", y_test),
                    ("training", training)):
    print(label, part.shape)

X_train (197, 19)
y_train (197,)
X_test (66, 19)
y_test (66,)
training (263, 20)


In [12]:
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale 
# PCA with default settings.
pca = PCA()

In [13]:
# Standardise the training predictors, then fit the PCA and project them onto its components.
X_reduced_train = pca.fit_transform(scale(X_train))

In [14]:
# Component scores of the first training row.
X_reduced_train[0:1,:]

array([[-2.49569913e+00, -3.37762397e-01,  7.06391950e-01,
        -1.32791025e+00, -8.21824333e-01, -6.62790677e-01,
        -6.56764789e-01,  3.68093279e-02, -2.03665105e-01,
         1.76134815e-01, -9.20131987e-02,  2.40129020e-01,
        -3.60473661e-03, -3.41246327e-02,  4.32799605e-02,
         1.02996923e-01,  3.70733348e-03,  1.37933445e-03,
        -6.63814471e-03]])

In [19]:
# Cumulative percentage of variance explained by the first six components.
np.cumsum(np.round(pca.explained_variance_ratio_, decimals = 4)*100)[0:6]

array([38.18, 59.88, 70.88, 78.88, 84.18, 88.45])

In [None]:
lm = LinearRegression()

In [None]:
# Principal component regression: regress salary on the training component scores.
pcr_model = lm.fit(X_reduced_train, y_train)

In [None]:
pcr_model.intercept_

In [None]:
# One coefficient per principal component.
pcr_model.coef_

## Tahmin

In [None]:
# In-sample predictions.
y_pred = pcr_model.predict(X_reduced_train)

In [None]:
y_pred[0:5]

In [None]:
# Training RMSE.
np.sqrt(mean_squared_error(y_train, y_pred))

In [None]:
# Mean salary, shown to put the RMSE in context.
df["Salary"].mean()

In [None]:
# Training R^2.
r2_score(y_train, y_pred)

In [None]:
pca2 = PCA()

In [None]:
X_reduced_test = pca2.fit_transform(scale(X_test))

In [None]:
y_pred = pcr_model.predict(X_reduced_test)

In [None]:
np.sqrt(mean_squared_error(y_test, y_pred))

## Model Tuning

In [None]:
# Refit using only the first 10 components and report the test RMSE.
lm = LinearRegression()
pcr_model = lm.fit(X_reduced_train[:,0:10], y_train)
y_pred = pcr_model.predict(X_reduced_test[:,0:10])
print(np.sqrt(mean_squared_error(y_test, y_pred)))

In [None]:
from sklearn import model_selection

In [None]:
cv_10 = model_selection.KFold(n_splits = 10,
                             shuffle = True,
                             random_state = 1)

In [None]:
lm = LinearRegression()

In [None]:
RMSE = []

In [None]:
for i in np.arange(1, X_reduced_train.shape[1] + 1):
    
    score = np.sqrt(-1*model_selection.cross_val_score(lm, 
                                                       X_reduced_train[:,:i], 
                                                       y_train.ravel(), 
                                                       cv=cv_10, 
                                                       scoring='neg_mean_squared_error').mean())
    RMSE.append(score)

In [None]:
plt.plot(RMSE, '-v')
plt.xlabel('Bileşen Sayısı')
plt.ylabel('RMSE')
plt.title('Maaş Tahmin Modeli İçin PCR Model Tuning');

In [None]:
lm = LinearRegression()

In [None]:
pcr_model = lm.fit(X_reduced_train[:,0:6], y_train)

In [None]:
y_pred = pcr_model.predict(X_reduced_train[:,0:6])

In [None]:
print(np.sqrt(mean_squared_error(y_train, y_pred)))

In [None]:
y_pred = pcr_model.predict(X_reduced_test[:,0:6])

In [None]:
print(np.sqrt(mean_squared_error(y_test, y_pred)))

# PLS

## Model

In [None]:
# Rebuild the modelling data from the raw file so this section runs on its own.
hit = pd.read_csv("Hitters.csv")
df = hit.copy()
df = df.dropna()
# Bind the dummies to `dms`, the name the concat below reads; assigning them to
# a different name left the concat depending on a stale `dms` from an earlier cell.
dms = pd.get_dummies(df[['League', 'Division', 'NewLeague']])
y = df["Salary"]
X_ = df.drop(['Salary', 'League', 'Division', 'NewLeague'], axis=1).astype('float64')
X = pd.concat([X_, dms[['League_N', 'Division_W', 'NewLeague_N']]], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)

In [None]:
from sklearn.cross_decomposition import PLSRegression, PLSSVD

In [None]:
# Partial least squares regression with the estimator's default settings.
pls_model = PLSRegression().fit(X_train, y_train)

In [None]:
pls_model.coef_

## Tahmin

In [None]:
X_train.head()

In [None]:
pls_model.predict(X_train)[0:10]

In [None]:
# In-sample predictions.
y_pred = pls_model.predict(X_train)

In [None]:
# Training RMSE.
np.sqrt(mean_squared_error(y_train, y_pred))

In [None]:
# Training R^2.
r2_score(y_train, y_pred)

In [None]:
# Predictions for the held-out rows (rebinds `y_pred`).
y_pred = pls_model.predict(X_test)

In [None]:
# Test RMSE.
np.sqrt(mean_squared_error(y_test, y_pred))

## Model Tuning

In [None]:
# Shuffled 10-fold splitter with a fixed seed
cv_10 = model_selection.KFold(n_splits=10, shuffle=True, random_state=1)

# Candidate component counts: 1 .. number of predictors
component_counts = np.arange(1, X_train.shape[1] + 1)

# Cross-validated RMSE for each component count
RMSE = []
for i in component_counts:
    pls = PLSRegression(n_components=i)
    neg_mse = cross_val_score(pls, X_train, y_train, cv=cv_10, scoring='neg_mean_squared_error')
    score = np.sqrt(-1*neg_mse.mean())
    RMSE.append(score)

# Visualise the results
plt.plot(component_counts, np.array(RMSE), '-v', c = "r")
plt.xlabel('Bileşen Sayısı')
plt.ylabel('RMSE')
plt.title('Salary');

In [None]:
# Final PLS model with two components.
pls_model = PLSRegression(n_components = 2).fit(X_train, y_train)

In [None]:
y_pred = pls_model.predict(X_test)

In [None]:
# Test RMSE.
np.sqrt(mean_squared_error(y_test, y_pred))

# Ridge Regresyon

## Model

In [None]:
# Rebuild the modelling data from the raw file so this section runs on its own.
hit = pd.read_csv("Hitters.csv")
df = hit.copy()
df = df.dropna()
# Bind the dummies to `dms`, the name the concat below reads; assigning them to
# a different name left the concat depending on a stale `dms` from an earlier cell.
dms = pd.get_dummies(df[['League', 'Division', 'NewLeague']])
y = df["Salary"]
X_ = df.drop(['Salary', 'League', 'Division', 'NewLeague'], axis=1).astype('float64')
X = pd.concat([X_, dms[['League_N', 'Division_W', 'NewLeague_N']]], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.25,
                                                    random_state=42)

In [None]:
from sklearn.linear_model import Ridge

In [None]:
# Ridge regression with penalty strength alpha = 0.1.
ridge_model = Ridge(alpha = 0.1).fit(X_train, y_train)

In [None]:
ridge_model

In [None]:
ridge_model.coef_

In [None]:
# Preview of the penalty grid: 100 values, log-spaced, from 0.5*10^10 down to 0.5*10^-2.
10**np.linspace(10,-2,100)*0.5 

In [None]:
# Penalty grid: 100 log-spaced values from 0.5*10^10 down to 0.5*10^-2.
lambdalar = 10**np.linspace(10,-2,100)*0.5

# Use a dedicated estimator for the path. Reusing `ridge_model` here would leave
# it refitted with the last alpha of the grid, so the prediction section below
# would no longer evaluate the alpha = 0.1 model fitted above.
ridge_path = Ridge()
katsayilar = []

for i in lambdalar:
    ridge_path.set_params(alpha = i)
    ridge_path.fit(X_train, y_train)
    katsayilar.append(ridge_path.coef_)

# One line per coefficient, on a log-scaled penalty axis.
ax = plt.gca()
ax.plot(lambdalar, katsayilar)
ax.set_xscale('log')

plt.xlabel('Lambda(Alpha) Değerleri')
plt.ylabel('Katsayılar/Ağırlıklar')
plt.title('Düzenlileştirmenin Bir Fonksiyonu Olarak Ridge Katsayıları');

## Tahmin

In [None]:
# Predict the held-out rows with the current `ridge_model`.
y_pred = ridge_model.predict(X_test)

In [None]:
# Test RMSE.
np.sqrt(mean_squared_error(y_test, y_pred))

## Model Tuning

In [None]:
lambdalar = 10**np.linspace(10,-2,100)*0.5 

In [None]:
lambdalar[0:5]

In [None]:
from sklearn.linear_model import RidgeCV
ridge_cv = RidgeCV(alphas = lambdalar, 
                   scoring = "neg_mean_squared_error",
                   normalize = True)

In [None]:
ridge_cv.fit(X_train, y_train)

In [None]:
ridge_cv.alpha_

In [None]:
ridge_tuned = Ridge(alpha = ridge_cv.alpha_, 
                   normalize = True).fit(X_train,y_train)

In [None]:
np.sqrt(mean_squared_error(y_test, ridge_tuned.predict(X_test)))

# Lasso Regresyon

## Model

In [None]:
# Rebuild the modelling data from the raw file so this section runs on its own.
hit = pd.read_csv("Hitters.csv")
df = hit.copy()
df = df.dropna()
# Bind the dummies to `dms`, the name the concat below reads; assigning them to
# a different name left the concat depending on a stale `dms` from an earlier cell.
dms = pd.get_dummies(df[['League', 'Division', 'NewLeague']])
y = df["Salary"]
X_ = df.drop(['Salary', 'League', 'Division', 'NewLeague'], axis=1).astype('float64')
X = pd.concat([X_, dms[['League_N', 'Division_W', 'NewLeague_N']]], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.25,
                                                    random_state=42)

In [None]:
from sklearn.linear_model import Lasso

In [None]:
# Lasso regression with penalty strength alpha = 0.1.
lasso_model = Lasso(alpha = 0.1).fit(X_train, y_train)

In [None]:
lasso_model

In [None]:
lasso_model.coef_

In [None]:
# Separate estimator for the path so `lasso_model` (alpha = 0.1) stays untouched.
lasso = Lasso()
# Penalty grid: 100 log-spaced values from 0.5*10^10 down to 0.5*10^-2.
lambdalar = 10**np.linspace(10,-2,100)*0.5
katsayilar = []

for i in lambdalar:
    lasso.set_params(alpha=i)
    lasso.fit(X_train, y_train)
    katsayilar.append(lasso.coef_)

ax = plt.gca()
# Plot against the alphas that were actually fitted; the axis is labelled
# 'alpha', so the values must not be rescaled.
ax.plot(lambdalar, katsayilar)
ax.set_xscale('log')
plt.axis('tight')
plt.xlabel('alpha')
plt.ylabel('weights');

## Tahmin 

In [None]:
# Predictions of the alpha = 0.1 lasso model for the held-out rows.
lasso_model.predict(X_test)

In [None]:
y_pred = lasso_model.predict(X_test)

In [None]:
# Test RMSE.
np.sqrt(mean_squared_error(y_test, y_pred))

## Model Tuning

In [None]:
from sklearn.linear_model import LassoCV

In [None]:
lasso_cv_model = LassoCV(alphas = None, 
                         cv = 10, 
                         max_iter = 10000, 
                         normalize = True)

In [None]:
lasso_cv_model.fit(X_train,y_train)

In [None]:
lasso_cv_model.alpha_

In [None]:
lasso_tuned = Lasso(alpha = lasso_cv_model.alpha_)

In [None]:
lasso_tuned.fit(X_train, y_train)

In [None]:
y_pred = lasso_tuned.predict(X_test)

In [None]:
np.sqrt(mean_squared_error(y_test, y_pred))

# ElasticNet Regresyonu

In [None]:
# Rebuild the modelling data from the raw file so this section runs on its own.
hit = pd.read_csv("Hitters.csv")
df = hit.copy()
df = df.dropna()
# Bind the dummies to `dms`, the name the concat below reads; assigning them to
# a different name left the concat depending on a stale `dms` from an earlier cell.
dms = pd.get_dummies(df[['League', 'Division', 'NewLeague']])
y = df["Salary"]
X_ = df.drop(['Salary', 'League', 'Division', 'NewLeague'], axis=1).astype('float64')
X = pd.concat([X_, dms[['League_N', 'Division_W', 'NewLeague_N']]], axis=1)
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    test_size=0.25,
                                                    random_state=42)




In [None]:
from sklearn.linear_model import ElasticNet

In [None]:
# Elastic net with the estimator's default settings.
enet_model = ElasticNet().fit(X_train, y_train)

In [None]:
enet_model.coef_

In [None]:
enet_model.intercept_

## Tahmin

In [None]:
enet_model

In [None]:
enet_model.predict(X_test)

In [None]:
# Predictions for the held-out rows.
y_pred = enet_model.predict(X_test)

In [None]:
# Test RMSE.
np.sqrt(mean_squared_error(y_test, y_pred))

In [None]:
# Test R^2.
r2_score(y_test, y_pred)

## Model Tuning

In [None]:
from sklearn.linear_model import ElasticNetCV

In [None]:
# Choose the penalty strength by 10-fold cross-validation on the training rows.
enet_cv_model = ElasticNetCV(cv = 10, random_state = 0).fit(X_train, y_train)

In [None]:
# Selected penalty strength.
enet_cv_model.alpha_

In [None]:
enet_cv_model

In [None]:
# Refit an elastic net with the selected alpha; other settings stay at their defaults.
enet_tuned = ElasticNet(alpha = enet_cv_model.alpha_).fit(X_train,y_train)

In [None]:
y_pred = enet_tuned.predict(X_test)

In [None]:
# Test RMSE of the tuned model.
np.sqrt(mean_squared_error(y_test, y_pred))