In [1]:
import pandas as pd
import numpy as np

from sklearn.model_selection import train_test_split
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import OneHotEncoder
from sklearn.decomposition import PCA
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import StandardScaler

Pyarrow will become a required dependency of pandas in the next major release of pandas (pandas 3.0),
(to allow more performant data types, such as the Arrow string type, and better interoperability with other libraries)
but was not found to be installed on your system.
If this would cause problems for you,
please provide us feedback at https://github.com/pandas-dev/pandas/issues/54466
        
  import pandas as pd


# Обработка данных

Краткое описание данных:
* SalePrice - the property's sale price in dollars. This is the target variable that you're trying to predict.
* MSSubClass: The building class
* MSZoning: The general zoning classification
* LotFrontage: Linear feet of street connected to property
* LotArea: Lot size in square feet
* Street: Type of road access
* Alley: Type of alley access
* LotShape: General shape of property
* LandContour: Flatness of the property
* Utilities: Type of utilities available
* LotConfig: Lot configuration
* LandSlope: Slope of property
* Neighborhood: Physical locations within Ames city limits
* Condition1: Proximity to main road or railroad
* Condition2: Proximity to main road or railroad (if a second is present)
* BldgType: Type of dwelling
* HouseStyle: Style of dwelling
* OverallQual: Overall material and finish quality
* OverallCond: Overall condition rating
* YearBuilt: Original construction date
* YearRemodAdd: Remodel date
* RoofStyle: Type of roof
* RoofMatl: Roof material
* Exterior1st: Exterior covering on house
* Exterior2nd: Exterior covering on house (if more than one material)
* MasVnrType: Masonry veneer type
* MasVnrArea: Masonry veneer area in square feet
* ExterQual: Exterior material quality
* ExterCond: Present condition of the material on the exterior
* Foundation: Type of foundation
* BsmtQual: Height of the basement
* BsmtCond: General condition of the basement
* BsmtExposure: Walkout or garden level basement walls
* BsmtFinType1: Quality of basement finished area
* BsmtFinSF1: Type 1 finished square feet
* BsmtFinType2: Quality of second finished area (if present)
* BsmtFinSF2: Type 2 finished square feet
* BsmtUnfSF: Unfinished square feet of basement area
* TotalBsmtSF: Total square feet of basement area
* Heating: Type of heating
* HeatingQC: Heating quality and condition
* CentralAir: Central air conditioning
* Electrical: Electrical system
* 1stFlrSF: First Floor square feet
* 2ndFlrSF: Second floor square feet
* LowQualFinSF: Low quality finished square feet (all floors)
* GrLivArea: Above grade (ground) living area square feet
* BsmtFullBath: Basement full bathrooms
* BsmtHalfBath: Basement half bathrooms
* FullBath: Full bathrooms above grade
* HalfBath: Half baths above grade
* Bedroom: Number of bedrooms above basement level
* Kitchen: Number of kitchens
* KitchenQual: Kitchen quality
* TotRmsAbvGrd: Total rooms above grade (does not include bathrooms)
* Functional: Home functionality rating
* Fireplaces: Number of fireplaces
* FireplaceQu: Fireplace quality
* GarageType: Garage location
* GarageYrBlt: Year garage was built
* GarageFinish: Interior finish of the garage
* GarageCars: Size of garage in car capacity
* GarageArea: Size of garage in square feet
* GarageQual: Garage quality
* GarageCond: Garage condition
* PavedDrive: Paved driveway
* WoodDeckSF: Wood deck area in square feet
* OpenPorchSF: Open porch area in square feet
* EnclosedPorch: Enclosed porch area in square feet
* 3SsnPorch: Three season porch area in square feet
* ScreenPorch: Screen porch area in square feet
* PoolArea: Pool area in square feet
* PoolQC: Pool quality
* Fence: Fence quality
* MiscFeature: Miscellaneous feature not covered in other categories
* MiscVal: $Value of miscellaneous feature
* MoSold: Month Sold
* YrSold: Year Sold
* SaleType: Type of sale
* SaleCondition: Condition of sale

In [2]:
train = pd.read_csv('train_price.csv')
labels = train["SalePrice"]
train = train.drop('SalePrice', axis=1)

In [3]:
X_train, X_test, y_train, y_test = train_test_split(train, labels, test_size=0.33, random_state=142)

In [4]:
nans=pd.isnull(X_train).sum()
nans[nans>0]

LotFrontage     173
Alley           917
MasVnrType      592
MasVnrArea        5
BsmtQual         24
BsmtCond         24
BsmtExposure     25
BsmtFinType1     24
BsmtFinType2     25
Electrical        1
FireplaceQu     464
GarageType       63
GarageYrBlt      63
GarageFinish     63
GarageQual       63
GarageCond       63
PoolQC          975
Fence           799
MiscFeature     945
dtype: int64

In [5]:
X_train=X_train.drop(["Id","Alley", "Fence", "MiscFeature", "PoolQC", "FireplaceQu"], axis=1)
X_test=X_test.drop(["Id","Alley", "Fence", "MiscFeature", "PoolQC", "FireplaceQu"], axis=1)

In [6]:
all_columns = X_train.columns.values
non_categorical = ["LotFrontage", "LotArea", "MasVnrArea", "BsmtFinSF1", 
                   "BsmtFinSF2", "BsmtUnfSF", "TotalBsmtSF", "1stFlrSF", 
                   "2ndFlrSF", "LowQualFinSF", "GrLivArea", "GarageArea", 
                   "WoodDeckSF", "OpenPorchSF", "EnclosedPorch", "3SsnPorch", 
                   "ScreenPorch","PoolArea", "MiscVal"]

catigorial = [value for value in all_columns if value not in non_categorical]

In [7]:
enc = OneHotEncoder(handle_unknown='ignore')
enc.fit(X_train[catigorial])

features_name = enc.get_feature_names_out(catigorial)
X_train[features_name] =  enc.transform(X_train[catigorial]).toarray()
X_train = X_train.drop(catigorial, axis=1)

X_test[features_name] = enc.transform(X_test[catigorial]).toarray()
X_test = X_test.drop(catigorial, axis=1)

  X_train[features_name] =  enc.transform(X_train[catigorial]).toarray()
  X_train[features_name] =  enc.transform(X_train[catigorial]).toarray()
  X_train[features_name] =  enc.transform(X_train[catigorial]).toarray()
  X_train[features_name] =  enc.transform(X_train[catigorial]).toarray()
  X_train[features_name] =  enc.transform(X_train[catigorial]).toarray()
  X_train[features_name] =  enc.transform(X_train[catigorial]).toarray()
  X_train[features_name] =  enc.transform(X_train[catigorial]).toarray()
  X_train[features_name] =  enc.transform(X_train[catigorial]).toarray()
  X_train[features_name] =  enc.transform(X_train[catigorial]).toarray()
  X_train[features_name] =  enc.transform(X_train[catigorial]).toarray()
  X_train[features_name] =  enc.transform(X_train[catigorial]).toarray()
  X_train[features_name] =  enc.transform(X_train[catigorial]).toarray()
  X_train[features_name] =  enc.transform(X_train[catigorial]).toarray()
  X_train[features_name] =  enc.transform(X_train[c

  X_test[features_name] = enc.transform(X_test[catigorial]).toarray()
  X_test[features_name] = enc.transform(X_test[catigorial]).toarray()
  X_test[features_name] = enc.transform(X_test[catigorial]).toarray()
  X_test[features_name] = enc.transform(X_test[catigorial]).toarray()
  X_test[features_name] = enc.transform(X_test[catigorial]).toarray()
  X_test[features_name] = enc.transform(X_test[catigorial]).toarray()
  X_test[features_name] = enc.transform(X_test[catigorial]).toarray()
  X_test[features_name] = enc.transform(X_test[catigorial]).toarray()
  X_test[features_name] = enc.transform(X_test[catigorial]).toarray()
  X_test[features_name] = enc.transform(X_test[catigorial]).toarray()
  X_test[features_name] = enc.transform(X_test[catigorial]).toarray()
  X_test[features_name] = enc.transform(X_test[catigorial]).toarray()
  X_test[features_name] = enc.transform(X_test[catigorial]).toarray()
  X_test[features_name] = enc.transform(X_test[catigorial]).toarray()
  X_test[features_na

In [8]:
imp = SimpleImputer(missing_values=np.nan, strategy='most_frequent')
X_train = imp.fit_transform(X_train)
X_test = imp.transform(X_test)

In [9]:
X_train_clear, X_test_clear = X_train, X_test

# Sklearn

In [10]:
scaler = StandardScaler()
X_train = scaler.fit_transform(X_train)
X_test = scaler.transform(X_test)

In [11]:
model = LinearRegression()
model.fit(X_train_clear, y_train)
print(f"R2 на train: {model.score(X_train_clear, y_train)}")
print(f"R2 на test: {model.score(X_test_clear, y_test)}")

R2 на train: 0.9686156269055428
R2 на test: -3682507.8160489406


In [12]:
pca = PCA(whiten=True)
pca.fit(X_train)
variance = pd.DataFrame(pca.explained_variance_ratio_)
np.cumsum(pca.explained_variance_ratio_)

array([0.03572396, 0.0544179 , 0.0693919 , 0.08300038, 0.09620454,
       0.10825926, 0.11833344, 0.1278904 , 0.13647934, 0.14487536,
       0.15263502, 0.16013307, 0.16733551, 0.17423849, 0.1810798 ,
       0.18769729, 0.19428796, 0.20072651, 0.20706908, 0.21329527,
       0.21946153, 0.22558031, 0.23163863, 0.23756624, 0.24346707,
       0.24923681, 0.25496191, 0.26062101, 0.26619557, 0.27173594,
       0.27721081, 0.28263934, 0.28803771, 0.29337935, 0.29866513,
       0.30390255, 0.30910821, 0.31428902, 0.31941817, 0.32445807,
       0.32949511, 0.33447658, 0.33942826, 0.34434951, 0.34924339,
       0.35410933, 0.35889738, 0.36366626, 0.36840056, 0.37312894,
       0.37780013, 0.38246219, 0.38711552, 0.39174386, 0.39632969,
       0.40088631, 0.40542406, 0.40993305, 0.41441793, 0.41889081,
       0.42333665, 0.42776302, 0.43214578, 0.43649529, 0.44082664,
       0.44512242, 0.44939856, 0.45366378, 0.4579062 , 0.462118  ,
       0.46631246, 0.47046225, 0.47459704, 0.47867818, 0.48275

In [13]:
np.where(np.cumsum(pca.explained_variance_ratio_) == 1.)

(array([], dtype=int64),)

In [14]:
pca = PCA(n_components=30, whiten=True)
pca = pca.fit(X_train)
X_train = pca.transform(X_train)
X_test = pca.transform(X_test)

In [15]:
model = LinearRegression()
model.fit(X_train, y_train)
print(f"R2 на train: {model.score(X_train, y_train)}")
print(f"R2 на test: {model.score(X_test, y_test)}")

R2 на train: 0.8117227789342677
R2 на test: 0.8345287984774606


# PCA по шагам

Реализовать PCA по шагам (слайд 12 презентации) самостоятельно. Применить к данным, которые мы смотрели выше. Посмотрить модель линейной регрессии, сравнить качество модели.

In [39]:
X_train, X_test = X_train_clear, X_test_clear

In [40]:
from sklearn.preprocessing import StandardScaler
#для стандартизации признаков
scaler = StandardScaler()
#вычисление параметров для стандартизации
scaler.fit(X_train)
#дф из стандартизированных данных
X_train = pd.DataFrame(scaler.transform(X_train), columns=scaler.get_feature_names_out())
X_test = pd.DataFrame(scaler.transform(X_test), columns=scaler.get_feature_names_out())

In [41]:
#вычисление ковариационной матрицы и собственных значений в-в
cov = np.cov(X_train, rowvar=False)

In [42]:
eigenvalues, eigenvectors = np.linalg.eig(cov)

In [43]:
eigenvalues = np.real(eigenvalues)
eigenvectors = np.real(eigenvectors)

In [44]:
u = eigenvectors[:, :30]

In [45]:
#применяем преобразование PCA к обучающим +тестовым данным
X_train_pca = np.dot(X_train, u)

In [46]:
X_test_pca = np.dot(X_test, u)

In [47]:
#обучение модели линейной регрессии 
model = LinearRegression()
model.fit(X_train_pca, y_train)
print(f"R2 на train: {model.score(X_train_pca, y_train)}")
print(f"R2 на test: {model.score(X_test_pca, y_test)}")

R2 на train: 0.8152783713716296
R2 на test: 0.8313419309579585
