In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge, Lasso, ElasticNet, LogisticRegression
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import RandomForestClassifier
from sklearn.svm import SVC
from sklearn.metrics import mean_squared_error, r2_score, accuracy_score, classification_report

In [2]:
data1 = pd.read_csv("train.csv")
data2 = pd.read_csv("test.csv")

In [3]:
df1 = pd.DataFrame(data1)
df2 = pd.DataFrame(data2)

## train.csv

In [5]:
df1.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition,SalePrice
0,1,60,RL,65.0,8450,Pave,,Reg,Lvl,AllPub,...,0,,,,0,2,2008,WD,Normal,208500
1,2,20,RL,80.0,9600,Pave,,Reg,Lvl,AllPub,...,0,,,,0,5,2007,WD,Normal,181500
2,3,60,RL,68.0,11250,Pave,,IR1,Lvl,AllPub,...,0,,,,0,9,2008,WD,Normal,223500
3,4,70,RL,60.0,9550,Pave,,IR1,Lvl,AllPub,...,0,,,,0,2,2006,WD,Abnorml,140000
4,5,60,RL,84.0,14260,Pave,,IR1,Lvl,AllPub,...,0,,,,0,12,2008,WD,Normal,250000


In [6]:
# Finding number of null values in the dataframe
df1.isnull().sum()

Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 81, dtype: int64

In [7]:
# Data PreparationHandling missing values
df1 = df1.fillna(df1.median(numeric_only=True))
df1 = df1.fillna(df1.mode().iloc[0])

In [8]:
# Getting  information about the dataframe
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1460 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          1460 non-null   object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [9]:
# Converting categorical variables to numerical ones using one-hot encoding
df1 = pd.get_dummies(df1)

In [10]:
# Spliting the data into features and target variable
X = df1.drop('SalePrice', axis=1)
y = df1['SalePrice']

In [11]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

In [12]:
# Standardize the features
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [13]:
# Simple Linear Regression
lr = LinearRegression()
lr.fit(X_train_scaled, y_train)
y_pred = lr.predict(X_test_scaled)
print("Linear Regression R2 Score:", r2_score(y_test, y_pred))

Linear Regression R2 Score: -3.245609110629437e+24


In [14]:
# Polynomial Regression
poly = PolynomialFeatures(degree=2)
X_train_poly = poly.fit_transform(X_train_scaled)
X_test_poly = poly.transform(X_test_scaled)

In [15]:
lr_poly = LinearRegression()
lr_poly.fit(X_train_poly, y_train)
y_pred_poly = lr_poly.predict(X_test_poly)
print("Polynomial Regression R2 Score:", r2_score(y_test, y_pred_poly))

Polynomial Regression R2 Score: 0.835382888170251


In [16]:
# Ridge Regression
ridge = Ridge(alpha=1.0)
ridge.fit(X_train_scaled, y_train)
y_pred_ridge = ridge.predict(X_test_scaled)
print("Ridge Regression R2 Score:", r2_score(y_test, y_pred_ridge))

Ridge Regression R2 Score: 0.8945339360052393


In [17]:
# Lasso Regression
lasso = Lasso(alpha=0.1)
lasso.fit(X_train_scaled, y_train)
y_pred_lasso = lasso.predict(X_test_scaled)
print("Lasso Regression R2 Score:", r2_score(y_test, y_pred_lasso))

Lasso Regression R2 Score: 0.8957826764772415


  model = cd_fast.enet_coordinate_descent(


In [18]:
# ElasticNet Regression
elastic = ElasticNet(alpha=0.1, l1_ratio=0.5)
elastic.fit(X_train_scaled, y_train)
y_pred_elastic = elastic.predict(X_test_scaled)
print("ElasticNet Regression R2 Score:", r2_score(y_test, y_pred_elastic))

ElasticNet Regression R2 Score: 0.8958310382083453


In [19]:
# Binary classification
y_class = (df1['SaleCondition_Normal'] == 'Normal').astype(int)
X_train_c, X_test_c, y_train_c, y_test_c = train_test_split(X, y_class, test_size=0.2, random_state=42)

In [20]:
# Naive Bayes
nb = GaussianNB()
nb.fit(X_train_scaled, y_train_c)
y_pred_nb = nb.predict(X_test_scaled)
print("Naive Bayes Accuracy:", accuracy_score(y_test_c, y_pred_nb))
print(classification_report(y_test_c, y_pred_nb))

Naive Bayes Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       292

    accuracy                           1.00       292
   macro avg       1.00      1.00      1.00       292
weighted avg       1.00      1.00      1.00       292



In [21]:
# k-Nearest Neighbors
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_train_scaled, y_train_c)
y_pred_knn = knn.predict(X_test_scaled)
print("k-Nearest Neighbors Accuracy:", accuracy_score(y_test_c, y_pred_knn))
print(classification_report(y_test_c, y_pred_knn))

k-Nearest Neighbors Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       292

    accuracy                           1.00       292
   macro avg       1.00      1.00      1.00       292
weighted avg       1.00      1.00      1.00       292



In [22]:
# Decision Trees
dt = DecisionTreeClassifier(random_state=42)
dt.fit(X_train_scaled, y_train_c)
y_pred_dt = dt.predict(X_test_scaled)
print("Decision Trees Accuracy:", accuracy_score(y_test_c, y_pred_dt))
print(classification_report(y_test_c, y_pred_dt))

Decision Trees Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       292

    accuracy                           1.00       292
   macro avg       1.00      1.00      1.00       292
weighted avg       1.00      1.00      1.00       292



In [23]:
# Random Forest
rf = RandomForestClassifier(n_estimators=100, random_state=42)
rf.fit(X_train_scaled, y_train_c)
y_pred_rf = rf.predict(X_test_scaled)
print("Random Forest Accuracy:", accuracy_score(y_test_c, y_pred_rf))
print(classification_report(y_test_c, y_pred_rf))

Random Forest Accuracy: 1.0
              precision    recall  f1-score   support

           0       1.00      1.00      1.00       292

    accuracy                           1.00       292
   macro avg       1.00      1.00      1.00       292
weighted avg       1.00      1.00      1.00       292



## test.csv

In [25]:
df2.head()

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,,Reg,Lvl,AllPub,...,120,0,,MnPrv,,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,,IR1,Lvl,AllPub,...,0,0,,,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,,IR1,Lvl,AllPub,...,0,0,,MnPrv,,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,,IR1,Lvl,AllPub,...,0,0,,,,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,,IR1,HLS,AllPub,...,144,0,,,,0,1,2010,WD,Normal


In [26]:
# Finding number of null values in the dataframe
df2.isnull().sum()

Id                 0
MSSubClass         0
MSZoning           4
LotFrontage      227
LotArea            0
                ... 
MiscVal            0
MoSold             0
YrSold             0
SaleType           1
SaleCondition      0
Length: 80, dtype: int64

In [27]:
# Data Preparation - Handling missing values
df2.fillna(df2.median(numeric_only=True))
df2.fillna(df2.mode().iloc[0])

Unnamed: 0,Id,MSSubClass,MSZoning,LotFrontage,LotArea,Street,Alley,LotShape,LandContour,Utilities,...,ScreenPorch,PoolArea,PoolQC,Fence,MiscFeature,MiscVal,MoSold,YrSold,SaleType,SaleCondition
0,1461,20,RH,80.0,11622,Pave,Grvl,Reg,Lvl,AllPub,...,120,0,Ex,MnPrv,Shed,0,6,2010,WD,Normal
1,1462,20,RL,81.0,14267,Pave,Grvl,IR1,Lvl,AllPub,...,0,0,Ex,MnPrv,Gar2,12500,6,2010,WD,Normal
2,1463,60,RL,74.0,13830,Pave,Grvl,IR1,Lvl,AllPub,...,0,0,Ex,MnPrv,Shed,0,3,2010,WD,Normal
3,1464,60,RL,78.0,9978,Pave,Grvl,IR1,Lvl,AllPub,...,0,0,Ex,MnPrv,Shed,0,6,2010,WD,Normal
4,1465,120,RL,43.0,5005,Pave,Grvl,IR1,HLS,AllPub,...,144,0,Ex,MnPrv,Shed,0,1,2010,WD,Normal
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1454,2915,160,RM,21.0,1936,Pave,Grvl,Reg,Lvl,AllPub,...,0,0,Ex,MnPrv,Shed,0,6,2006,WD,Normal
1455,2916,160,RM,21.0,1894,Pave,Grvl,Reg,Lvl,AllPub,...,0,0,Ex,MnPrv,Shed,0,4,2006,WD,Abnorml
1456,2917,20,RL,160.0,20000,Pave,Grvl,Reg,Lvl,AllPub,...,0,0,Ex,MnPrv,Shed,0,9,2006,WD,Abnorml
1457,2918,85,RL,62.0,10441,Pave,Grvl,Reg,Lvl,AllPub,...,0,0,Ex,MnPrv,Shed,700,7,2006,WD,Normal


In [40]:
# Drop rows with missing target values for regression
df2 = df2.dropna(subset=['LotFrontage'])

# Handle missing values in predictors
df2 = df2.fillna(df2.median(numeric_only=True))

# Split the data into features (X) and target (y) for regression tasks
X_reg = df2.drop(['LotFrontage'], axis=1)
y_reg = df2['LotFrontage']

# Split the data into features (X) and target (y) for classification tasks
X_clf = df2.drop(['SaleCondition'], axis=1)
y_clf = df2['SaleCondition']

# Encode categorical variables
X_reg = pd.get_dummies(X_reg)
X_clf = pd.get_dummies(X_clf)

# Standardize features
scaler = StandardScaler()
X_reg = scaler.fit_transform(X_reg)
X_clf = scaler.fit_transform(X_clf)

# Split the data into training and testing sets
X_reg_train, X_reg_test, y_reg_train, y_reg_test = train_test_split(X_reg, y_reg, test_size=0.2, random_state=42)
X_clf_train, X_clf_test, y_clf_train, y_clf_test = train_test_split(X_clf, y_clf, test_size=0.2, random_state=42)

In [42]:
# Simple Linear Regression
lin_reg = LinearRegression()
lin_reg.fit(X_reg_train, y_reg_train)
y_pred = lin_reg.predict(X_reg_test)
print(f'Simple Linear Regression RMSE: {np.sqrt(mean_squared_error(y_reg_test, y_pred))}')

Simple Linear Regression RMSE: 305566924444509.2


In [44]:
# Polynomial Regression
poly_features = PolynomialFeatures(degree=2)
X_poly_train = poly_features.fit_transform(X_reg_train)
X_poly_test = poly_features.transform(X_reg_test)

poly_reg = LinearRegression()
poly_reg.fit(X_poly_train, y_reg_train)
y_poly_pred = poly_reg.predict(X_poly_test)
print(f'Polynomial Regression RMSE: {np.sqrt(mean_squared_error(y_reg_test, y_poly_pred))}')

Polynomial Regression RMSE: 18.651439607884264


In [46]:
# Ridge Regression
ridge_reg = Ridge(alpha=1.0)
ridge_reg.fit(X_reg_train, y_reg_train)
y_ridge_pred = ridge_reg.predict(X_reg_test)
print(f'Ridge Regression RMSE: {np.sqrt(mean_squared_error(y_reg_test, y_ridge_pred))}')

Ridge Regression RMSE: 14.046955006933109


In [48]:
# Lasso Regression
lasso_reg = Lasso(alpha=0.1)
lasso_reg.fit(X_reg_train, y_reg_train)
y_lasso_pred = lasso_reg.predict(X_reg_test)
print(f'Lasso Regression RMSE: {np.sqrt(mean_squared_error(y_reg_test, y_lasso_pred))}')

Lasso Regression RMSE: 13.12381652119041


In [50]:
# Logistic Regression
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_clf_train, y_clf_train)
y_log_pred = log_reg.predict(X_clf_test)
print(f'Logistic Regression Accuracy: {accuracy_score(y_clf_test, y_log_pred)}')

Logistic Regression Accuracy: 0.8461538461538461


In [None]:
# Logistic Regression
log_reg = LogisticRegression(max_iter=1000)
log_reg.fit(X_clf_train, y_clf_train)
y_log_pred = log_reg.predict(X_clf_test)
print(f'Logistic Regression Accuracy: {accuracy_score(y_clf_test, y_log_pred)}')

In [52]:
# Naive Bayes
nb = GaussianNB()
nb.fit(X_clf_train, y_clf_train)
y_nb_pred = nb.predict(X_clf_test)
print(f'Naive Bayes Accuracy: {accuracy_score(y_clf_test, y_nb_pred)}')

Naive Bayes Accuracy: 0.3076923076923077


In [54]:
# k-Nearest Neighbors
knn = KNeighborsClassifier(n_neighbors=5)
knn.fit(X_clf_train, y_clf_train)
y_knn_pred = knn.predict(X_clf_test)
print(f'k-Nearest Neighbors Accuracy: {accuracy_score(y_clf_test, y_knn_pred)}')

k-Nearest Neighbors Accuracy: 0.8704453441295547


In [56]:
# Decision Trees
dt = DecisionTreeClassifier()
dt.fit(X_clf_train, y_clf_train)
y_dt_pred = dt.predict(X_clf_test)
print(f'Decision Trees Accuracy: {accuracy_score(y_clf_test, y_dt_pred)}')

Decision Trees Accuracy: 0.8502024291497976


In [58]:
# Random Forest
rf = RandomForestClassifier(n_estimators=100)
rf.fit(X_clf_train, y_clf_train)
y_rf_pred = rf.predict(X_clf_test)
print(f'Random Forest Accuracy: {accuracy_score(y_clf_test, y_rf_pred)}')

Random Forest Accuracy: 0.9109311740890689


In [60]:
# Support Vector Machines
svc = SVC()
svc.fit(X_clf_train, y_clf_train)
y_svc_pred = svc.predict(X_clf_test)
print(f'Support Vector Machines Accuracy: {accuracy_score(y_clf_test, y_svc_pred)}')

Support Vector Machines Accuracy: 0.9068825910931174
