In [1]:
import pandas as pd

df = pd.read_csv('AmesHousing.csv')

In [2]:
df.shape

(2930, 82)

In [3]:
df.head()

Unnamed: 0,Order,PID,MS SubClass,MS Zoning,Lot Frontage,Lot Area,Street,Alley,Lot Shape,Land Contour,...,Pool Area,Pool QC,Fence,Misc Feature,Misc Val,Mo Sold,Yr Sold,Sale Type,Sale Condition,SalePrice
0,1,526301100,20,RL,141.0,31770,Pave,,IR1,Lvl,...,0,,,,0,5,2010,WD,Normal,215000
1,2,526350040,20,RH,80.0,11622,Pave,,Reg,Lvl,...,0,,MnPrv,,0,6,2010,WD,Normal,105000
2,3,526351010,20,RL,81.0,14267,Pave,,IR1,Lvl,...,0,,,Gar2,12500,6,2010,WD,Normal,172000
3,4,526353030,20,RL,93.0,11160,Pave,,Reg,Lvl,...,0,,,,0,4,2010,WD,Normal,244000
4,5,527105010,60,RL,74.0,13830,Pave,,IR1,Lvl,...,0,,MnPrv,,0,3,2010,WD,Normal,189900


In [4]:
df.SalePrice /= 1000

In [5]:
df.duplicated().sum()

np.int64(0)

In [6]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2930 entries, 0 to 2929
Data columns (total 82 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   Order            2930 non-null   int64  
 1   PID              2930 non-null   int64  
 2   MS SubClass      2930 non-null   int64  
 3   MS Zoning        2930 non-null   object 
 4   Lot Frontage     2440 non-null   float64
 5   Lot Area         2930 non-null   int64  
 6   Street           2930 non-null   object 
 7   Alley            198 non-null    object 
 8   Lot Shape        2930 non-null   object 
 9   Land Contour     2930 non-null   object 
 10  Utilities        2930 non-null   object 
 11  Lot Config       2930 non-null   object 
 12  Land Slope       2930 non-null   object 
 13  Neighborhood     2930 non-null   object 
 14  Condition 1      2930 non-null   object 
 15  Condition 2      2930 non-null   object 
 16  Bldg Type        2930 non-null   object 
 17  House Style   

In [7]:
df.describe().T

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Order,2930.0,1465.5,845.9625,1.0,733.25,1465.5,2197.75,2930.0
PID,2930.0,714464500.0,188730800.0,526301100.0,528477000.0,535453620.0,907181100.0,1007100000.0
MS SubClass,2930.0,57.38737,42.63802,20.0,20.0,50.0,70.0,190.0
Lot Frontage,2440.0,69.22459,23.36533,21.0,58.0,68.0,80.0,313.0
Lot Area,2930.0,10147.92,7880.018,1300.0,7440.25,9436.5,11555.25,215245.0
Overall Qual,2930.0,6.094881,1.411026,1.0,5.0,6.0,7.0,10.0
Overall Cond,2930.0,5.56314,1.111537,1.0,5.0,5.0,6.0,9.0
Year Built,2930.0,1971.356,30.24536,1872.0,1954.0,1973.0,2001.0,2010.0
Year Remod/Add,2930.0,1984.267,20.86029,1950.0,1965.0,1993.0,2004.0,2010.0
Mas Vnr Area,2907.0,101.8968,179.1126,0.0,0.0,0.0,164.0,1600.0


## прогнали линейную регрессию с L1 регуляризацией для выяснения наличия линейной зависимости признаков

In [8]:
import numpy as np
import pandas as pd
from tqdm import tqdm
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import Lasso
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import OneHotEncoder, RobustScaler
from sklearn.compose import ColumnTransformer
from sklearn.impute import SimpleImputer
from sklearn.metrics import r2_score, mean_absolute_error, mean_squared_error

# Целевая переменная и признаки
y = df["SalePrice"]
X = df.drop(columns=["SalePrice"])

# Разделяем типы признаков
numeric_features = X.select_dtypes(include=[np.number]).columns.tolist()
categorical_features = X.select_dtypes(include=["object"]).columns.tolist()

# Препроцессоры
numeric_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="median")),
    ("scaler", RobustScaler())
])

categorical_transformer = Pipeline(steps=[
    ("imputer", SimpleImputer(strategy="constant", fill_value="Missing")),
    ("encoder", OneHotEncoder(handle_unknown="ignore"))
])

preprocessor = ColumnTransformer([
    ("num", numeric_transformer, numeric_features),
    ("cat", categorical_transformer, categorical_features)
])

# Lasso регрессия
model = Lasso(max_iter=10000)

# Pipeline
pipe = Pipeline([
    ("preprocessing", preprocessor),
    ("regressor", model)
])

# Сетка параметров для подбора alpha
params = {
    "regressor__alpha": [0.001, 0.01, 0.1, 1, 10]
}

# Train/test split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

# GridSearchCV
grid = GridSearchCV(pipe, param_grid=params, scoring="r2", cv=3, n_jobs=-1)
grid.fit(X_train, y_train)

# Предсказание
y_pred = grid.predict(X_test)

# Метрики
r2 = r2_score(y_test, y_pred)
mae = mean_absolute_error(y_test, y_pred)
rmse = mean_squared_error(y_test, y_pred)

print(f"Best alpha: {grid.best_params_}")
print(f"Test R²: {r2:.4f}, MAE: {mae:.2f}, RMSE: {rmse:.2f}")

# === Извлекаем коэффициенты модели ===
best_model = grid.best_estimator_.named_steps["regressor"]

# Получаем имена признаков после OneHotEncoding
feature_names = grid.best_estimator_.named_steps["preprocessing"].get_feature_names_out()
coefficients = best_model.coef_

coef_df = pd.DataFrame({
    "feature": feature_names,
    "coefficient": coefficients
}).sort_values(by="coefficient", key=np.abs, ascending=False)

# Смотрим, какие коэффициенты обнулились
zero_coefs = coef_df[coef_df["coefficient"] == 0]
nonzero_coefs = coef_df[coef_df["coefficient"] != 0]

print(f"Нулевых коэффициентов: {len(zero_coefs)} из {len(coef_df)}")
print(coef_df.head(10))
print('------------')
print(zero_coefs)


Best alpha: {'regressor__alpha': 0.1}
Test R²: 0.8111, MAE: 16.28, RMSE: 1240.27
Нулевых коэффициентов: 213 из 317
                       feature  coefficient
132     cat__Roof Matl_ClyTile  -348.964810
17            num__Gr Liv Area    35.707741
86   cat__Neighborhood_NoRidge    34.934097
286            cat__Pool QC_Ex    34.428038
93   cat__Neighborhood_StoneBr    31.533904
176         cat__Exter Qual_Ex    27.542023
241       cat__Kitchen Qual_Ex    19.750656
138     cat__Roof Matl_WdShngl    19.355904
87   cat__Neighborhood_NridgHt    18.472530
5            num__Overall Qual    18.177226
------------
                         feature  coefficient
236        cat__Electrical_FuseF          0.0
237        cat__Electrical_FuseP         -0.0
238      cat__Electrical_Missing          0.0
239          cat__Electrical_Mix          0.0
240        cat__Electrical_SBrkr          0.0
..                           ...          ...
150      cat__Exterior 1st_Stone         -0.0
151     cat__Exterio

In [9]:
features = zero_coefs.feature.to_list()
zero_features = set()
for feature in features:
    if feature.startswith('cat__'):
        cutter = feature.replace('cat__', '')
        cutter = cutter[: cutter.find('_')]
    else:
        cutter = feature.replace('num__', '')

    zero_features.add(cutter)
print('zero_features length', len(zero_features))
zero_features

zero_features length 48


{'1st Flr SF',
 'Alley',
 'Bldg Type',
 'Bsmt Cond',
 'Bsmt Exposure',
 'Bsmt Half Bath',
 'Bsmt Qual',
 'Bsmt Unf SF',
 'BsmtFin Type 1',
 'BsmtFin Type 2',
 'Central Air',
 'Condition 1',
 'Condition 2',
 'Electrical',
 'Exter Cond',
 'Exter Qual',
 'Exterior 1st',
 'Exterior 2nd',
 'Fence',
 'Fireplace Qu',
 'Foundation',
 'Functional',
 'Garage Cond',
 'Garage Finish',
 'Garage Qual',
 'Garage Type',
 'Heating',
 'Heating QC',
 'House Style',
 'Kitchen Qual',
 'Land Contour',
 'Land Slope',
 'Lot Config',
 'Lot Frontage',
 'MS Zoning',
 'Mas Vnr Type',
 'Misc Feature',
 'Neighborhood',
 'Open Porch SF',
 'Order',
 'Paved Drive',
 'Pool QC',
 'Roof Matl',
 'Roof Style',
 'Sale Condition',
 'Sale Type',
 'Street',
 'Utilities'}

In [10]:
df1 = df.drop(columns=list(zero_features))
df1.describe()

Unnamed: 0,PID,MS SubClass,Lot Area,Overall Qual,Overall Cond,Year Built,Year Remod/Add,Mas Vnr Area,BsmtFin SF 1,BsmtFin SF 2,...,Garage Area,Wood Deck SF,Enclosed Porch,3Ssn Porch,Screen Porch,Pool Area,Misc Val,Mo Sold,Yr Sold,SalePrice
count,2930.0,2930.0,2930.0,2930.0,2930.0,2930.0,2930.0,2907.0,2929.0,2929.0,...,2929.0,2930.0,2930.0,2930.0,2930.0,2930.0,2930.0,2930.0,2930.0,2930.0
mean,714464500.0,57.387372,10147.921843,6.094881,5.56314,1971.356314,1984.266553,101.896801,442.629566,49.722431,...,472.819734,93.751877,23.011604,2.592491,16.002048,2.243345,50.635154,6.216041,2007.790444,180.79606
std,188730800.0,42.638025,7880.017759,1.411026,1.111537,30.245361,20.860286,179.112611,455.590839,169.168476,...,215.046549,126.361562,64.139059,25.141331,56.08737,35.597181,566.344288,2.714492,1.316613,79.886692
min,526301100.0,20.0,1300.0,1.0,1.0,1872.0,1950.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,1.0,2006.0,12.789
25%,528477000.0,20.0,7440.25,5.0,5.0,1954.0,1965.0,0.0,0.0,0.0,...,320.0,0.0,0.0,0.0,0.0,0.0,0.0,4.0,2007.0,129.5
50%,535453600.0,50.0,9436.5,6.0,5.0,1973.0,1993.0,0.0,370.0,0.0,...,480.0,0.0,0.0,0.0,0.0,0.0,0.0,6.0,2008.0,160.0
75%,907181100.0,70.0,11555.25,7.0,6.0,2001.0,2004.0,164.0,734.0,0.0,...,576.0,168.0,0.0,0.0,0.0,0.0,0.0,8.0,2009.0,213.5
max,1007100000.0,190.0,215245.0,10.0,9.0,2010.0,2010.0,1600.0,5644.0,1526.0,...,1488.0,1424.0,1012.0,508.0,576.0,800.0,17000.0,12.0,2010.0,755.0


In [11]:
df1.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2930 entries, 0 to 2929
Data columns (total 34 columns):
 #   Column           Non-Null Count  Dtype  
---  ------           --------------  -----  
 0   PID              2930 non-null   int64  
 1   MS SubClass      2930 non-null   int64  
 2   Lot Area         2930 non-null   int64  
 3   Lot Shape        2930 non-null   object 
 4   Overall Qual     2930 non-null   int64  
 5   Overall Cond     2930 non-null   int64  
 6   Year Built       2930 non-null   int64  
 7   Year Remod/Add   2930 non-null   int64  
 8   Mas Vnr Area     2907 non-null   float64
 9   BsmtFin SF 1     2929 non-null   float64
 10  BsmtFin SF 2     2929 non-null   float64
 11  Total Bsmt SF    2929 non-null   float64
 12  2nd Flr SF       2930 non-null   int64  
 13  Low Qual Fin SF  2930 non-null   int64  
 14  Gr Liv Area      2930 non-null   int64  
 15  Bsmt Full Bath   2928 non-null   float64
 16  Full Bath        2930 non-null   int64  
 17  Half Bath     

In [12]:
df1.to_csv('clean.csv')