# [Problem 1] Feature selection for practice

In [1]:
import pandas as pd

- <b>We first read the dataset, extract the explanatory variables GrLivArea and YearBuilt, and the objective variable SalePrice, respectively</b>

In [2]:
# Read the complete training table once; pd_train stays available for later column picks.
pd_train = pd.read_csv('train.csv')
# Keep the two explanatory variables together with the SalePrice objective.
target_variable = pd_train.loc[:, ["GrLivArea", "YearBuilt", "SalePrice"]]

FileNotFoundError: [Errno 2] No such file or directory: 'train.csv'

- ***We visualize the extracted variables that we will be working with.***

In [None]:
# Preview the first five rows of the selected columns (5 is also head()'s default).
target_variable.head(5)

# [Problem 2] Estimation and evaluation by linear regression

- ***We are going to analyze and pre-process our data, then train a linear regression model, use it for estimation, and evaluate and visualize the results.***

In [None]:
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

from sklearn.metrics import mean_squared_error

# Separate the two explanatory columns from the SalePrice objective.
X = target_variable[["GrLivArea", "YearBuilt"]]
y = target_variable.loc[:, "SalePrice"]

# Hold out 25% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Learn the scaling statistics from the training rows only, then apply the
# same statistics to the held-out rows so no test information leaks in.
scaler = StandardScaler()
X_train_trans = scaler.fit_transform(X_train)
X_test_trans = scaler.transform(X_test)

# Fit the linear model on the standardized features and predict the held-out rows.
reg = LinearRegression()
reg.fit(X_train_trans, y_train)
reg_pred = reg.predict(X_test_trans)

# Report the test-set mean squared error.
print("MSE:", mean_squared_error(y_test, reg_pred))

- ***We are now visualizing the extracted data***

- We use scatter plots to create graphs that allow us to compare the actual data with the predicted results.

In [None]:
import matplotlib.pyplot as plt

# Figure 1: actual sale prices of the held-out rows against living area.
plt.scatter(X_test.loc[:, 'GrLivArea'], y_test, label='actual')
plt.title('GrLivArea')
plt.xlabel('GrLivArea')
plt.ylabel('SalePrice')
plt.show()

# Figure 2: the linear model's predictions for the same rows.
# The title and x-label now spell the column 'GrLivArea', matching the data
# and the first figure (they previously read 'GrLiveArea').
plt.scatter(X_test.loc[:, 'GrLivArea'], reg_pred, label='predicted')
plt.title('GrLivArea Predicted Result')
plt.xlabel('GrLivArea')
plt.ylabel('SalePrice')
plt.show()

# Figure 3: actual sale prices of the held-out rows against construction year.
plt.scatter(X_test.loc[:, 'YearBuilt'], y_test)
plt.title('YearBuilt')
plt.xlabel('YearBuilt')
plt.ylabel('SalePrice')
plt.show()

# [Problem 3] Comparison of methods

We estimate, evaluate, and visualize using each of the following methods:
- Linear regression
- SVM
- Decision tree
- Random forest

***Then, we summarize the index values in one table.***

In [None]:
# Parallel accumulators filled by display_outputs(): one MSE and one model name per call.
mSE_results, model_names = [], []

def display_outputs(title,y_prediction):
    """Record and report a model's test-set MSE, then draw three scatter plots.

    Args:
        title: Model name; it is printed and appended to ``model_names``.
        y_prediction: Predicted values, scored against the notebook-level
            ``y_test``.

    Side effects:
        Appends to the notebook-level ``mSE_results`` and ``model_names``
        lists, prints the score, and shows three figures built from the
        notebook-level ``X_test`` / ``y_test``.
    """
    # Score the predictions and keep the result for the summary table.
    score = mean_squared_error(y_true=y_test, y_pred=y_prediction)
    mSE_results.append(score)
    model_names.append(title)

    # Report the score.
    print(title)
    print("MSE:", score)

    # One figure per entry: (feature column, y values, extra scatter kwargs).
    figures = (
        ('GrLivArea', y_test, {'label': 'actual'}),  # actual prices
        ('GrLivArea', y_prediction, {}),             # predicted prices
        ('YearBuilt', y_prediction, {}),             # predicted prices by build year
    )
    for column, y_values, scatter_kwargs in figures:
        plt.scatter(X_test.loc[:, column], y_values, **scatter_kwargs)
        plt.title(column)
        plt.xlabel(column)
        plt.ylabel('SalePrice')
        plt.show()

    # Visual separator between the outputs of successive models.
    print("-----------------------------------------")

from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier

# SalePrice is a continuous target, so every model compared here must be a
# regressor. A classifier (SVC, DecisionTreeClassifier, RandomForestClassifier)
# would treat each distinct sale price as its own class label, which makes the
# MSE comparison against linear regression meaningless. The regression
# counterparts are imported here so this section is self-contained.
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor

# --- Linear regression ---
reg = LinearRegression().fit(X_train_trans, y_train)
reg_pred = reg.predict(X_test_trans)
display_outputs('Linear Regression', reg_pred)

# --- SVM (support vector regression) ---
# The inputs are already standardized, so the StandardScaler step inside the
# pipeline is redundant; it is kept so the pipeline also works on raw features.
# NOTE(review): SVR with default C/epsilon is sensitive to the scale of the
# target; expect a large MSE on unscaled prices unless it is tuned.
clf = make_pipeline(StandardScaler(), SVR(gamma='auto'))
clf.fit(X_train_trans, y_train)
clf_pred = clf.predict(X_test_trans)
display_outputs('SVM', clf_pred)

# --- Decision tree ---
# random_state pins the tie-breaking between equally good splits.
dTC = DecisionTreeRegressor(random_state=0)
dTC.fit(X_train_trans, y_train)
dTC_pred = dTC.predict(X_test_trans)
display_outputs('Decision tree', dTC_pred)

# --- Random forest ---
# Shallow trees (max_depth=2) as in the original setup; random_state makes the
# bootstrap samples and feature choices repeatable.
rFC = RandomForestRegressor(max_depth=2, random_state=0)
rFC.fit(X_train_trans, y_train)
rFC_pred = rFC.predict(X_test_trans)
display_outputs('Random Forest', rFC_pred)


In [None]:
# Summary table: one row per model name, holding that model's test-set MSE.
pd.DataFrame({'MSE': mSE_results}, index=model_names)

- **Mean Squared Error(MSE)** is a measure of how close a fitted line is to data points.

- **What the figure looked like**
The smaller the **Mean Squared Error(MSE)**, the closer the fit is to the data

After splitting the dataset into training and test sets and obtaining predictions from each method,
we calculate the MSE for each method.

- Linear Regression shows a better result because its MSE is closer to 0 than that of the other methods.

# [Problem 4] (Advance assignment) Learning using other features

###### We show the data

In [None]:
# Preview the first five rows of the complete training table (5 is also head()'s default).
pd_train.head(5)

In [None]:
# Widen the working frame with two more candidate features: LotArea and YrSold.
target_variable = pd_train.loc[:, ["GrLivArea", "YearBuilt", "SalePrice", "LotArea", "YrSold"]]

# Explanatory variables (the four feature columns) and the SalePrice objective.
X = target_variable[["GrLivArea", "YearBuilt", "LotArea", "YrSold"]]
y = target_variable.loc[:, "SalePrice"]

# Hold out 25% of the rows for evaluation; the fixed seed keeps the split repeatable.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.25, random_state=42
)

# Standardize with statistics learned from the training rows only, then apply
# the same statistics to the held-out rows.
scaler = StandardScaler()
X_train_trans = scaler.fit_transform(X_train)
X_test_trans = scaler.transform(X_test)

# Fit the linear model on the standardized features and predict the held-out rows.
reg = LinearRegression()
reg.fit(X_train_trans, y_train)
reg_pred = reg.predict(X_test_trans)

# Report the test-set mean squared error.
print("MSE:", mean_squared_error(y_test, reg_pred))