# Here we will implement linear regression from scratch
## The task is to compare results of linear, polynomial and interactive regressions
### In addition we will implement different validation techniques

In [194]:
import pandas as pd
import numpy as np
import math

from sklearn import datasets, linear_model
from sklearn.metrics import mean_squared_error, r2_score

In [195]:
def shuffle_data_pd(df):
    #return df.reindex(np.random.permutation(df.index))
    return df.sample(frac=1).reset_index(drop=True)

    
def shuffle_data_np(dataset):
    np.random.shuffle(dataset)
    return dataset

def example():
    df = pd.DataFrame(np.random.random((6, 2)), columns=['a', 'b'])
    num = np.random.random((6, 2))
    print(df)
    print(shuffle_data_pd(df))

    print(num)

    print(shuffle_data_np(num))


In [158]:
def k_fold_cv_split_hands(dataset, k):
    N = len(dataset)
    #shuffle
    indexes = np.arange(0, N)
    indexes = shuffle_data_np(indexes)
    
    # splitting
    
    trains = []
    tests = []
    
    fraction = 1. / k
    
    for i in range(k):
        trains.append([])
        tests.append([])
        
        bottom = (int)(fraction * i * N)
        top = (int)(fraction * (i+1) * N)
        if top > N:
            top = N

        tests[i] = indexes[bottom:top]
        
        train1 = indexes[0:bottom]
        train2 = indexes[top:N]
        trains[i] = np.concatenate((train1, train2))
        
    return trains, tests
    
def k_fold_cv_tools(dataset, k):
    pass
    
def cv_example():
    df = pd.DataFrame(np.random.random((18, 2)), columns=['a', 'b'])
    print(df)
    train_indexes, test_indexes = k_fold_cv_split_hands(df, 3)
    print(train_indexes)
    print(test_indexes)



In [159]:
def k_fold_cv(dataset, k, method,  method_type, pred_cols):
    def get_error(predicted, actual, method_type):
        if method_type in 'regression': #MSE
            summ = 0
            actual = actual['mpg'].tolist()
            for i in range(len(predicted)):
                summ += (predicted[i] - actual[i]) ** 2
            return summ / len(predicted)
        else: # Classification error
            counter = 0 # WORK WITH PANDAS DF
            actual = actual['species'].tolist()
            for i in range(len(predicted)):
                counter += predicted[i] != actual[i]
            return counter
    
    mses = []
    X = dataset[pred_cols]
    Y = dataset[[col for col in dataset.columns if col not in pred_cols]]
    
    trains, tests = k_fold_cv_split_hands(dataset, k)
    for i in range(k):
        train = trains[i]
        test = tests[i]
        train_X, train_Y = X.iloc[train, :], Y.iloc[train, :]
        test_X, test_Y = X.iloc[test, :], Y.iloc[test, :]
        method.fit(train_X, train_Y)
        predict_Y = method.predict(test_X)
        mse = get_error(predict_Y, test_Y, method_type)
        mses.append(mse)
    mmse = sum(mses) / len(mses)
    return mses, mmse

In [160]:
def clean_dataset(df, column):
    average = 0.0
    N = len(df)
    indexes = []
    for index, row in df.iterrows():
        if row[column] == '?': # here also can be useful np.nan
            indexes.append(index)
        else:
            average += (float)(row[column])
    average = average * 1.0 / N
    for ind in indexes:
        df.at[ind, column] = average
        
    return df

In [202]:
class MyRegression:
    def __init__(self):
        self.bettas = []
        
        
    def gradient_descent(self, x, y, alfa, stop, iter_limit=100000, debug=False):
        '''
        x - matrix
        y - vector
        alfa - vector
        stop - const
        ---
        bettas - vector
        '''
        
        iterations = 0
        N = len(x)
        m = len(x.columns)
        
        bettas = [0] * (m + 1) # b0
        bettas_best = [0] * (m + 1)
        error = self.get_error(x, y, bettas, N)
        error_old = error
        
        while (math.fabs(error) > stop and iterations < iter_limit):
            #training
            derives = [0] * m
            for i in range(m):
                derives = self.derived_cost_function(x, y, bettas, i)
            for i in range(m):
                bettas[i] -= alfa * derives[i]
            error_old = error
            error = self.get_error(x, y, bettas, N)
            if error < error_old:
                bettas_best = list(bettas)
            if debug is True:
                if iterations % 1000 == 0:
                    print(bettas)
                    print("Error {}".format(error))
            iteration += 1
        return bettas_best
    
    def fit(self, x_train, y_train):
        self.bettas = self.gradient_descent(x_train, y_train, alfa=0.95, stop=1e-2)
    
    def predict(self, x_test):
        pass
    
    def cost_function(self, x_val, y_val, bettas):
        pass
    
    def derived_cost_function(self, x, y, bettas, index):
        pass
    
    def get_error(self, x, y, bettas, N):
        err = 0
        for i in range(N):
            err += self.cost_function(x.iloc[i].tolist(), y[i], bettas)
        return err / N

class MyLinearRegression(MyRegression):
    def __init__(self, cost_function, derive_functions=None):
        super(MyLinearRegression, self).__init__()
        self._cost_function = cost_function
        self.derive_functions = derive_functions
    
    def derived_cost_function(self, x, y, bettas, index):
        return self.derive_functions[index](x, y, bettas)

    def cost_function(self, x, y, bettas): # per one item
        return self._cost_function(x, y, bettas)
    
    def predict(x_test):
        N = len(x_test)
        results = [0] * N
        for i in range(N):
            results[i] = np.dot(([1] + x[i]), bettas)
        return results

def lin_example():
    df = pd.read_csv('datasets/mpg.csv')
    df = clean_dataset(df, 'horsepower')
    df.drop(['name', 'model_year', 'origin'], axis=1, inplace=True)
    df['horsepower'] = df.horsepower.astype(float)
    
    def cost_function(x, y, bettas):
        val = (y - (np.dot(x, bettas[1:]) + bettas[0]))
        return val ** 2
    def derives_1(x, y, bettas):
        y = y.tolist()
        x = x.iloc[i]
        return -2 * (y - np.dot(x, bettas[1:]) - bettas[0])
    def derives_2(x, y, bettas):
        print(y)
        x_val = x.iloc[i]
        return -2 * x_val[0] * (y - np.dot(x, bettas[1:]) - bettas[0])
    def derives_3(x, y, bettas):
        return -2 * x[1] * (y - np.dot(x, bettas[1:]) - bettas[0])
    def derives_4(x, y, bettas):
        return -2 * x[2] * (y - np.dot(x, bettas[1:]) - bettas[0])
    def derives_5(x, y, bettas):
        return -2 * x[3] * (y - np.dot(x, bettas[1:]) - bettas[0])
    
    lin = MyLinearRegression(cost_function, [derives_1, derives_2, derives_3, derives_4, derives_5])
    #shuffle
    df = df.sample(frac=1).reset_index(drop=True)
    
    #separation
    x, y = df[['cylinders', 'displacement', 'horsepower', 'weight', 'acceleration']], df['mpg']
    msk = (int)(len(x) * 0.7)
    print(len(y))
    print(len(x))

    x_train = x.iloc[0:msk]
    y_train = y.iloc[0:msk]
    x_test = x.iloc[msk:-1]
    y_test = y.iloc[msk:-1]
    
    lin.fit(x_train, y_train)
    predicted = lin.predict(x_test)
    mse = 0
    for i in range(len(predicted)):
        mse += (predicted[i] - y_test[i]) ** 2 
    mse /= len(predicted)
    print(mse)
    
    #k_fold_cv(dataset, k, lin, 'regression', ['cylinders', 'displacement', 'horsepower', 'weight', 'acceleration'])
lin_example()

398
398
0      17.7
1      13.0
2      28.0
3      24.2
4      37.0
5      13.0
6      16.0
7      18.0
8      32.4
9      26.0
10     32.0
11     18.0
12     28.4
13     37.3
14     12.0
15     24.5
16     28.0
17     14.0
18     24.0
19     18.2
20     23.9
21     20.0
22     17.5
23     18.0
24     22.0
25     36.1
26     21.0
27     33.8
28     25.0
29     21.0
       ... 
248    15.0
249    25.5
250    16.0
251    13.0
252    13.0
253    37.0
254    14.0
255    18.0
256    14.0
257    17.0
258    23.0
259    25.8
260    18.0
261    33.0
262    38.0
263    24.0
264    20.5
265    43.1
266    29.9
267    18.0
268    21.5
269    15.0
270     9.0
271    16.0
272    22.0
273    14.0
274    25.0
275    18.5
276    15.5
277    29.0
Name: mpg, Length: 278, dtype: float64


NameError: free variable 'i' referenced before assignment in enclosing scope

In [163]:
df_logistic = pd.read_csv('datasets/iris.csv')

df_linear = pd.read_csv('datasets/mpg.csv')

df_linear = clean_dataset(df_linear, 'horsepower')
df_linear.drop(['name'],axis=1, inplace=True)

#print(df)
regr = linear_model.LogisticRegression()
lin = linear_model.LinearRegression()
mses, mmse = k_fold_cv(df_linear, 5, lin, 'regression', ['cylinders', 'displacement', 'horsepower', 'weight', 'acceleration', 'model_year', 'origin'])
print(mses, mmse)

[array([10.37522832]), array([9.28884745]), array([12.57132636]), array([9.34767029]), array([15.63091383])] [11.44279725]


In [143]:
df_linear = pd.read_csv('datasets/mpg.csv')
df_linear = clean_dataset(df_linear, 'horsepower')
df_linear['horsepower'] = df_linear.horsepower.astype(float)
print(df_linear.dtypes)
# df_linear = clean_dataset(df_linear, 'horsepower')

[32, 126, 330, 336, 354, 374]
mpg             float64
cylinders         int64
displacement    float64
horsepower      float64
weight            int64
acceleration    float64
model_year        int64
origin            int64
name             object
dtype: object


In [109]:
df = pd.DataFrame(np.random.random((6, 2)), columns=list('AB'))
df.iloc[[2, 3], :]

Unnamed: 0,A,B
2,0.214161,0.745174
3,0.659387,0.94561
