# In [None]:
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
import statsmodels.api as sm
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.preprocessing import OrdinalEncoder
from sklearn.preprocessing import normalize
from sklearn.metrics import mean_squared_error
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import SelectFwe, chi2, SelectPercentile, SelectKBest
import pickle


# Columns that are passed through the ordinal encoder (see Transform/Convert).
quality = [
    'manufacturer',
    'model',
    'transmission',
    'color',
    'engineFuel',
    'engineType',
    'bodyType',
    'drivetrain',
]
# Single shared encoder instance: fitted in Transform(), applied in Convert().
encoder = OrdinalEncoder()

def GetData():
    """Prompt for two CSV paths on stdin and load both files.

    The features path is requested first, then the outcome path; each file is
    read with ``pd.read_csv`` using default options.

    Returns:
        tuple: ``(features, outcome)`` as two pandas DataFrames.
    """
    features_path = input('Features data: ')
    outcome_path = input('Outcome data: ')

    # Tuple elements are evaluated left to right: features are read first.
    return pd.read_csv(features_path), pd.read_csv(outcome_path)

def SaveModel(model, path):
    """Pickle *model* to the file at *path* and print a confirmation.

    Args:
        model: Any picklable object (callers in this file pass a fitted
            regressor).
        path: Destination file path; it is opened in ``'wb'`` mode, so an
            existing file is overwritten.

    Raises:
        OSError: If the file cannot be opened for writing.
        pickle.PicklingError: If *model* cannot be pickled.
    """
    # The context manager guarantees the handle is flushed and closed, even
    # when pickling fails part-way through.
    with open(path, 'wb') as handle:
        pickle.dump(model, handle)
    print('Model saved!')
    
def LoadModel(path):
    """Load and return the object pickled in the file at *path*.

    Args:
        path: Path of a file previously written with ``pickle.dump``
            (for example by ``SaveModel``).

    Returns:
        The unpickled object.

    Raises:
        OSError: If the file cannot be opened for reading.
        pickle.UnpicklingError: If the file content is not a valid pickle.
    """
    # NOTE(security): unpickling can execute arbitrary code. Only load model
    # files that come from a trusted source.
    # The context manager closes the handle once the object has been read.
    with open(path, 'rb') as handle:
        return pickle.load(handle)

def FormatCSV(data):
    """Replace booleans and missing values in *data* with numbers, in place.

    Three replacements are applied to the whole frame, in this order:
    ``False`` -> 0, ``True`` -> 1, ``NaN`` -> 0. Each result is written back
    through ``data.iloc[:, :]``; the function returns nothing.
    """
    # (value to look for, value to write instead), applied sequentially.
    substitutions = ((False, 0), (True, 1), (np.nan, 0))

    for old_value, new_value in substitutions:
        replaced = data.iloc[:, :].replace(to_replace=old_value, value=new_value)
        data.iloc[:, :] = replaced
    
def Convert(data):
    """Encode the ``quality`` columns and rewrite ``year``, in place.

    The columns listed in the module-level ``quality`` list are overwritten
    with the output of the module-level ``encoder`` (it has to be fitted
    before this is called), and every ``year`` value ``y`` is replaced by
    ``2020 - y``. Nothing is returned.
    """
    encoded_columns = encoder.transform(data.loc[:, quality])
    data.loc[:, quality] = encoded_columns

    # Store the distance of each year from the fixed reference year 2020.
    data.year = [2020 - model_year for model_year in data.year]

    
def Transform(data, result):
    """Build the regression design matrix and target vector.

    Steps, in order:

    1. Fit the module-level ``encoder`` on the ``quality`` columns of *data*.
    2. Sort both frames by ``id`` and attach ``result.price`` to the features
       by position, then sort the rows by ``price`` and drop ``id``.
    3. Clean the frame with ``FormatCSV`` and encode it with ``Convert``.
    4. Expand one group of columns with a degree-6 polynomial and pass the
       other group through a degree-1 expansion (which prepends a bias
       column), then concatenate both column-wise.

    Args:
        data: Features DataFrame. Must contain ``id``, every column named in
            ``quality``, and the columns listed in the body below.
        result: Outcome DataFrame with ``id`` and ``price`` columns. It is
            assumed to hold exactly one row per row of *data* (same ids).

    Returns:
        tuple: ``(Z, Y)`` where ``Z`` is the 2-D feature array and ``Y`` the
        1-D array of prices, both ordered by ascending price.

    Raises:
        ValueError: If *data* and *result* have a different number of rows.
        KeyError: If a required column is missing.
    """
    print('Transform input data set...')

    # NOTE(review): the shared encoder is refitted on every call, including
    # the call made from Test(). The category codes used for a test set can
    # therefore differ from the ones the saved model was trained with --
    # confirm whether the encoder should be fitted once and persisted.
    encoder.fit(data.loc[:, quality])

    data = data.sort_values(['id'])
    result = result.sort_values(['id'])

    # Assign by position, not as a Series: a Series assignment is aligned on
    # the row index, which would silently undo the sort by id above and pair
    # rows by their original file position instead.
    data['price'] = result.price.values
    data = data.sort_values('price')

    data = data.drop(['id'], axis=1)

    # Taken before FormatCSV(), so missing prices are not turned into 0 here.
    Y = data.price.values

    FormatCSV(data)
    Convert(data)

    # Columns that only get the degree-1 treatment.
    linear_columns = [
        'odometer', 'feature_7', 'feature_3', 'feature_8', 'feature_6',
        'feature_5', 'feature_2', 'feature_4', 'feature_9', 'feature_1',
        'feature_0',
    ]
    # Columns that are expanded into polynomial terms up to degree 6.
    polynomial_columns = [
        'photos', 'engineFuel', 'bodyType', 'drivetrain', 'transmission',
        'model', 'engineType', 'manufacturer', 'year', 'engineCapacity',
    ]

    # The columns are selected by name, so no correlation-based reordering of
    # the frame is needed beforehand.
    X_1 = np.nan_to_num(data.loc[:, polynomial_columns].values)
    X_2 = np.nan_to_num(data.loc[:, linear_columns].values)

    pr_1 = PolynomialFeatures(degree=6, interaction_only=False)
    pr_2 = PolynomialFeatures(degree=1, interaction_only=True)
    X_1 = pr_1.fit_transform(X_1)
    X_2 = pr_2.fit_transform(X_2)

    Z = np.concatenate((X_1, X_2), axis=1)

    print('Transform completed!')

    return Z, Y

def Predict(linear_regressor, X, Y):
    """Plot and score the predictions of *linear_regressor* on *X*.

    Shows a figure with the true values *Y* as a scatter plot and the
    predictions as a red line, both plotted against the sample position, then
    prints the model's ``score(X, Y)`` as "R2" and the root of the mean
    squared error as "RMSE".

    Args:
        linear_regressor: Fitted model exposing ``predict`` and ``score``.
        X: Feature matrix to predict on.
        Y: True target values for *X*.
    """
    sample_positions = np.arange(len(X))
    predictions = linear_regressor.predict(X)

    plt.scatter(sample_positions, Y)
    plt.plot(sample_positions, predictions, color='red')
    plt.show()

    squared_error = mean_squared_error(Y, predictions)
    root_error = squared_error ** 0.5
    print("R2:", linear_regressor.score(X, Y))
    print("RMSE:", root_error)

def Train():
    """Interactively fit, evaluate and save a linear regression model.

    Prompts for the training CSV files, builds the design matrix with
    ``Transform``, fits a ``LinearRegression``, reports its fit on the
    training data via ``Predict`` and finally pickles the model to a
    user-supplied file name.

    Returns:
        The fitted ``LinearRegression`` instance.
    """
    print('Enter training data set:')
    features, outcome = GetData()
    design_matrix, target = Transform(features, outcome)

    print('Training...')
    model = LinearRegression(n_jobs=-1)
    model.fit(design_matrix, target)
    print('Train completed!')

    print('Predict on training data set:')
    Predict(model, design_matrix, target)

    # Ask for the destination only after the evaluation has been shown.
    model_path = input('Enter file name to save model: ')
    SaveModel(model, model_path)

    return model

def Test():
    """Interactively evaluate a saved model on a test data set.

    Prompts for the test CSV files, builds the design matrix with
    ``Transform``, loads a pickled model from a user-supplied path and
    reports its performance via ``Predict``. Returns nothing.
    """
    print('Enter testing data set:')
    features, outcome = GetData()
    design_matrix, target = Transform(features, outcome)

    # The model file is requested only after the data has been transformed.
    model_path = input('Enter model file: ')
    model = LoadModel(model_path)

    print('Predict on testing data set:')
    Predict(model, design_matrix, target)

# Run the interactive training step as soon as this code is executed:
# prompts for the training files, fits the model and saves it.
Train()

# Then run the interactive evaluation step: prompts for the test files and
# for the path of a saved model.
Test()