In [144]:
import pandas as pd
from sklearn.linear_model import LinearRegression
from sklearn.impute import SimpleImputer
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_squared_error, r2_score

In [109]:
# Load the dataset
df = pd.read_csv('train.csv')


In [110]:
# Print the first few rows of the dataset
print(df.head())

# Print the column names
print(df.columns)


   Id  MSSubClass MSZoning  LotFrontage  LotArea Street Alley LotShape  \
0   1          60       RL         65.0     8450   Pave   NaN      Reg   
1   2          20       RL         80.0     9600   Pave   NaN      Reg   
2   3          60       RL         68.0    11250   Pave   NaN      IR1   
3   4          70       RL         60.0     9550   Pave   NaN      IR1   
4   5          60       RL         84.0    14260   Pave   NaN      IR1   

  LandContour Utilities  ... PoolArea PoolQC Fence MiscFeature MiscVal MoSold  \
0         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
1         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      5   
2         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      9   
3         Lvl    AllPub  ...        0    NaN   NaN         NaN       0      2   
4         Lvl    AllPub  ...        0    NaN   NaN         NaN       0     12   

  YrSold  SaleType  SaleCondition  SalePrice  
0   2008        WD   

In [111]:
print("Distinct count in 'GrLivArea':", df['GrLivArea'].nunique())
print("Distinct count in 'Bedroom':", df['BedroomAbvGr'].nunique())
print("Distinct count in 'FullBath':", df['FullBath'].nunique())
print("Distinct count in 'SalePrice':", df['SalePrice'].nunique())


Distinct count in 'GrLivArea': 861
Distinct count in 'Bedroom': 8
Distinct count in 'FullBath': 4
Distinct count in 'SalePrice': 663


In [112]:
# Display distinct (unique) values for the specified columns
print("Distinct values in 'GrLivArea':")
print(df['GrLivArea'].unique())

print("\nDistinct values in 'Bedroom':")
print(df['BedroomAbvGr'].unique())

print("\nDistinct values in 'FullBath':")
print(df['FullBath'].unique())

print("\nDistinct values in 'SalePrice':")
print(df['SalePrice'].unique())


Distinct values in 'GrLivArea':
[1710 1262 1786 1717 2198 1362 1694 2090 1774 1077 1040 2324  912 1494
 1253  854 1004 1296 1114 1339 2376 1108 1795 1060 1600  900 1704  520
 1317 1228 1234 1700 1561 2452 1097 1297 1057 1152 1324 1328  884  938
 1150 1752 2149 1656 1452  955 1470 1176  816 1842 1360 1425 1739 1720
 2945  780 1158 1111 1370 2034 2473 2207 1479  747 2287 2223  845 1718
 1086 1605  988  952 1285 1768 1230 2142 1337 1563 1065 1474 2417 1560
 1224 1526  990 1235  964 2291 1588  960  835 1225 1610 1732 1535 1226
 1818 1992 1047  789 1517 1844 1855 1430 2696 2259 2320 1458 1092 1125
 3222 1456 1123 1080 1199 1586  754  958  840 1348 1053 2157 2054 1327
 1721 1682 1214 1959 1852 1764  864 1734 1385 1501 1728 1709  875 2035
 1344  969 1993 1252 1200 1096 1968 1947 2462 1232 2668 1541  882 1616
 1355 1867 2161 1707 1382 1767 1651 2158 2060 1920 2234  968 1525 1802
 1340 2082 3608 1217 1593 2727 1431 1726 3112 2229 1713 1121 1279 1310
  848 1284 1442 1696 1100 2062 1212 1392 1236

In [113]:
# Understand the dataset structure
print(df.info())



<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1460 entries, 0 to 1459
Data columns (total 81 columns):
 #   Column         Non-Null Count  Dtype  
---  ------         --------------  -----  
 0   Id             1460 non-null   int64  
 1   MSSubClass     1460 non-null   int64  
 2   MSZoning       1460 non-null   object 
 3   LotFrontage    1201 non-null   float64
 4   LotArea        1460 non-null   int64  
 5   Street         1460 non-null   object 
 6   Alley          91 non-null     object 
 7   LotShape       1460 non-null   object 
 8   LandContour    1460 non-null   object 
 9   Utilities      1460 non-null   object 
 10  LotConfig      1460 non-null   object 
 11  LandSlope      1460 non-null   object 
 12  Neighborhood   1460 non-null   object 
 13  Condition1     1460 non-null   object 
 14  Condition2     1460 non-null   object 
 15  BldgType       1460 non-null   object 
 16  HouseStyle     1460 non-null   object 
 17  OverallQual    1460 non-null   int64  
 18  OverallC

In [114]:
print(df.describe())

                Id   MSSubClass  LotFrontage        LotArea  OverallQual  \
count  1460.000000  1460.000000  1201.000000    1460.000000  1460.000000   
mean    730.500000    56.897260    70.049958   10516.828082     6.099315   
std     421.610009    42.300571    24.284752    9981.264932     1.382997   
min       1.000000    20.000000    21.000000    1300.000000     1.000000   
25%     365.750000    20.000000    59.000000    7553.500000     5.000000   
50%     730.500000    50.000000    69.000000    9478.500000     6.000000   
75%    1095.250000    70.000000    80.000000   11601.500000     7.000000   
max    1460.000000   190.000000   313.000000  215245.000000    10.000000   

       OverallCond    YearBuilt  YearRemodAdd   MasVnrArea   BsmtFinSF1  ...  \
count  1460.000000  1460.000000   1460.000000  1452.000000  1460.000000  ...   
mean      5.575342  1971.267808   1984.865753   103.685262   443.639726  ...   
std       1.112799    30.202904     20.645407   181.066207   456.098091  ..

In [115]:
# Check for null values in the dataset
null_values = df.isnull().sum()
print(null_values)


Id                 0
MSSubClass         0
MSZoning           0
LotFrontage      259
LotArea            0
                ... 
MoSold             0
YrSold             0
SaleType           0
SaleCondition      0
SalePrice          0
Length: 81, dtype: int64


In [116]:
# Check the data type of the LotFrontage column
print(df['LotFrontage'].dtypes)

# Alternative: Use type() for a single value in the column
print(type(df['LotFrontage'].iloc[0]))


float64
<class 'numpy.float64'>


In [122]:
df['LotFrontage'] = df['LotFrontage'].fillna(df['LotFrontage'].median())
#df.fillna(df.mean(), inplace=True)


In [123]:
# Check for NaN values in the 'LotFrontage' column
nan_values = df['LotFrontage'].isna().sum()

# Print the number of NaN values in 'LotFrontage'
print(f'Number of NaN values in LotFrontage: {nan_values}')


Number of NaN values in LotFrontage: 0


In [129]:
# Check for columns with null values
null_columns = df.columns[df.isnull().any()]

# Display the columns that have null values
print("Columns with null values:")
print(null_columns)



Columns with null values:
Index(['Alley', 'MasVnrType', 'MasVnrArea', 'BsmtQual', 'BsmtCond',
       'BsmtExposure', 'BsmtFinType1', 'BsmtFinType2', 'Electrical',
       'FireplaceQu', 'GarageType', 'GarageYrBlt', 'GarageFinish',
       'GarageQual', 'GarageCond', 'PoolQC', 'Fence', 'MiscFeature'],
      dtype='object')


In [133]:
print(df['LotFrontage'].isna().sum())
print(df['SalePrice'].isna().sum())
print(df['FullBath'].isna().sum())
print(df['BedroomAbvGr'].isna().sum())
print(df['PoolQC'].isna().sum())


0
0
0
0
1453


In [134]:
# Separate features (X) and target variable (SalePrice)
X = df.drop('SalePrice', axis=1)
y = df['SalePrice']


In [137]:
# Step 1: Impute missing values in numeric columns (fill NaN with median)
numeric_cols = X.select_dtypes(include=['number']).columns
imputer_num = SimpleImputer(strategy='median')
X[numeric_cols] = imputer_num.fit_transform(X[numeric_cols])

# Step 2: Impute missing values in categorical columns (fill NaN with most frequent value)
categorical_cols = X.select_dtypes(include=['object']).columns
imputer_cat = SimpleImputer(strategy='most_frequent')
X[categorical_cols] = imputer_cat.fit_transform(X[categorical_cols])

# Step 3: One-hot encode the categorical columns
X = pd.get_dummies(X, columns=categorical_cols, drop_first=True)

In [138]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)



In [140]:
# Check if there are any NaN values left in X_train or X_test
print("Missing values in X_train after imputation:")
print(X_train.isna().sum())  # Shows count of NaNs in each column

print("\nMissing values in X_test after imputation:")
print(y_test.isna().sum())  # Shows count of NaNs in each column


Missing values in X_train after imputation:
Id                       0
MSSubClass               0
LotFrontage              0
LotArea                  0
OverallQual              0
                        ..
SaleCondition_AdjLand    0
SaleCondition_Alloca     0
SaleCondition_Family     0
SaleCondition_Normal     0
SaleCondition_Partial    0
Length: 245, dtype: int64

Missing values in X_test after imputation:
0


In [141]:
import pandas as pd

# Assuming your DataFrame is named 'df'
# ... (your code to load the dataset) ...

# Select columns with object (string) data type
non_numerical_data = df.select_dtypes(include=['object'])

# Display the non-numerical data
print(non_numerical_data)

# Alternatively, you can display the first few rows of the non-numerical data:
print(non_numerical_data.head())

     MSZoning Street Alley LotShape LandContour Utilities LotConfig LandSlope  \
0          RL   Pave   NaN      Reg         Lvl    AllPub    Inside       Gtl   
1          RL   Pave   NaN      Reg         Lvl    AllPub       FR2       Gtl   
2          RL   Pave   NaN      IR1         Lvl    AllPub    Inside       Gtl   
3          RL   Pave   NaN      IR1         Lvl    AllPub    Corner       Gtl   
4          RL   Pave   NaN      IR1         Lvl    AllPub       FR2       Gtl   
...       ...    ...   ...      ...         ...       ...       ...       ...   
1455       RL   Pave   NaN      Reg         Lvl    AllPub    Inside       Gtl   
1456       RL   Pave   NaN      Reg         Lvl    AllPub    Inside       Gtl   
1457       RL   Pave   NaN      Reg         Lvl    AllPub    Inside       Gtl   
1458       RL   Pave   NaN      Reg         Lvl    AllPub    Inside       Gtl   
1459       RL   Pave   NaN      Reg         Lvl    AllPub    Inside       Gtl   

     Neighborhood Condition

In [142]:
model = LinearRegression()
model.fit(X_train, y_train)

In [143]:
y_pred = model.predict(X_test)

In [145]:
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

In [146]:
print(f'Mean Squared Error: {mse}')
print(f'R^2 Score: {r2}')

Mean Squared Error: 2642483715.1941447
R^2 Score: 0.6554926552252592


In [147]:
#Save the predictions to a new CSV file
results = pd.DataFrame({
    'Actual': y_test,
    'Predicted': y_pred
})

In [148]:
results.to_csv('predicted_saleprice.csv', index=False)