In [29]:
import pandas as pd
from typing import Dict, Tuple
from pandas import DataFrame
from sklearn.model_selection import train_test_split
from imblearn.over_sampling import ADASYN
from sklearn.preprocessing import OneHotEncoder
import numpy as np
from sklearn import preprocessing

def split_stratified_into_train_val_test(
    df_input,
    stratify_colname="y",
    frac_train=0.6,
    frac_val=0.15,
    frac_test=0.25,
    random_state=None,
) -> Tuple[DataFrame, DataFrame, DataFrame, DataFrame, DataFrame, DataFrame]:
    """
    Split a dataframe into stratified train, validation and test subsets.

    Each subset keeps the same relative frequency of the values found in
    ``stratify_colname``. The split is performed by calling
    train_test_split() twice: first train vs. the rest, then the rest into
    validation vs. test.

    Parameters
    ----------
    df_input : Pandas dataframe
        Input dataframe to be split. All of its columns (including the
        stratification column) are kept in the returned feature frames.
    stratify_colname : str
        The name of the column that will be used for stratification. Usually
        this column would be for the label.
    frac_train : float
    frac_val   : float
    frac_test  : float
        The ratios with which the dataframe will be split into train, val, and
        test data. The values should be expressed as float fractions and should
        sum to 1.0 (compared with a small tolerance to absorb floating-point
        rounding, e.g. 0.7 + 0.2 + 0.1).
    random_state : int, None, or RandomStateInstance
        Value to be passed to both train_test_split() calls.

    Returns
    -------
    df_train, df_val, df_test, y_train, y_val, y_test :
        The three splits of ``df_input`` followed by the matching
        single-column dataframes holding ``stratify_colname``. When
        ``frac_val <= 0`` no validation split is made and ``df_val`` / ``y_val``
        are empty dataframes.

    Raises
    ------
    ValueError
        If the fractions do not add up to 1.0, or if ``stratify_colname`` is
        not a column of ``df_input``.
    """

    # Exact float equality would reject legitimate inputs such as
    # 0.7 + 0.2 + 0.1 (== 0.9999999999999999), so compare with a tolerance.
    if abs((frac_train + frac_val + frac_test) - 1.0) > 1e-9:
        raise ValueError(
            "fractions %f, %f, %f do not add up to 1.0"
            % (frac_train, frac_val, frac_test)
        )

    if stratify_colname not in df_input.columns:
        raise ValueError("%s is not a column in the dataframe" % (stratify_colname))

    X = df_input  # Contains all columns.
    y = df_input[
        [stratify_colname]
    ]  # Dataframe of just the column on which to stratify.

    # Split original dataframe into train and temp dataframes.
    df_train, df_temp, y_train, y_temp = train_test_split(
        X, y, stratify=y, test_size=(1.0 - frac_train), random_state=random_state
    )

    # Without a validation share, the remainder is the test set as a whole.
    if frac_val <= 0:
        assert len(df_input) == len(df_train) + len(df_temp)
        return df_train, pd.DataFrame(), df_temp, y_train, pd.DataFrame(), y_temp

    # Split the temp dataframe into val and test dataframes; the test share is
    # re-expressed relative to the size of the temp dataframe.
    relative_frac_test = frac_test / (frac_val + frac_test)
    df_val, df_test, y_val, y_test = train_test_split(
        df_temp,
        y_temp,
        stratify=y_temp,
        test_size=relative_frac_test,
        random_state=random_state,
    )

    # Sanity check: no rows were lost or duplicated by the two-stage split.
    assert len(df_input) == len(df_train) + len(df_val) + len(df_test)
    return df_train, df_val, df_test, y_train, y_val, y_test

# Load the raw vehicle listings.
# NOTE(review): this is an absolute, machine-specific path; the notebook will
# only run where this exact file exists. Consider a path relative to a
# configurable data directory.
df = pd.read_csv("C:/Users/ekrosz/Desktop/Лабы/vehicles.csv", sep = ',', encoding = "windows-1251")

# Overview of the raw data: schema, size, first rows and columns with gaps.
df.info()
display(df.shape)
# A bare `df.head()` in the middle of a cell is evaluated and discarded;
# it has to go through display() to actually be shown.
display(df.head())
display(df.isnull().any())

# Treat a missing owner count as a single owner, then re-check the gaps.
df["OwnersByPts"] = df["OwnersByPts"].fillna(1)
display(df.isnull().any())

# Class balance of the column later used for stratification.
display(df.OwnersByPts.value_counts())

# Work on an independent copy holding only the columns used below.
data = df[["OwnersByPts", "IssueYear", "Price"]].copy()

# Stratified 60/20/20 split on the owner count. A fixed random_state makes the
# subsets (and every number displayed below) reproducible across re-runs;
# without it each execution produces a different split.
df_train, df_val, df_test, y_train, y_val, y_test = split_stratified_into_train_val_test(
    data,
    stratify_colname="OwnersByPts",
    frac_train=0.60,
    frac_val=0.20,
    frac_test=0.20,
    random_state=42,
)

# Report the size and class distribution of each subset.
display("Обучающая выборка: ", df_train.shape)
display(df_train.OwnersByPts.value_counts())

display("Контрольная выборка: ", df_val.shape)
display(df_val.OwnersByPts.value_counts())

display("Тестовая выборка: ", df_test.shape)
display(df_test.OwnersByPts.value_counts())

# Oversample the training subset with ADASYN so the owner-count classes are
# closer to balanced. ADASYN generates synthetic rows randomly, so the seed is
# fixed to keep the resampled set reproducible between runs.
ada = ADASYN(random_state=42)
# NOTE(review): df_train still contains the "OwnersByPts" column, so the
# target is also part of the feature matrix handed to ADASYN — confirm this
# is intended.
X_resampled, y_resampled = ada.fit_resample(df_train, df_train["OwnersByPts"])
df_train_adasyn = pd.DataFrame(X_resampled)

display("Обучающая выборка после oversampling: ", df_train_adasyn.shape)
display(df_train_adasyn.OwnersByPts.value_counts())

# One-hot encode the transmission type.
vehicles = df[["IssueYear", "Price", "Transmission"]].copy()

# drop="first" removes one category column to avoid a redundant indicator.
encoder = OneHotEncoder(sparse_output=False, drop="first")
encoded_values = encoder.fit_transform(vehicles[["Transmission"]])
encoded_columns = encoder.get_feature_names_out(["Transmission"])
# Reuse the source index: pd.concat(axis=1) aligns rows by index label, so a
# fresh RangeIndex would silently misalign (and introduce NaN rows) whenever
# `vehicles` does not carry a default 0..n-1 index.
encoded_values_df = pd.DataFrame(
    encoded_values, columns=encoded_columns, index=vehicles.index
)

vehicles = pd.concat([vehicles, encoded_values_df], axis=1)
display(vehicles)

# Discretise the issue year into three equal-width bins.
labels = ["old", "middle-year", "new"]
num_bins = 3
# np.histogram supplies the equal-width bin edges (and the per-bin counts).
hist1, bins1 = np.histogram(vehicles["IssueYear"], bins=num_bins)
display(hist1, bins1)

# pd.cut uses right-closed intervals and, by default, leaves the lowest edge
# out of the first one, so rows holding the minimum year would become NaN even
# though np.histogram counts them in the first bin. include_lowest=True keeps
# them in "old".
# NOTE(review): the resulting frame has two columns both named "IssueYear"
# (the raw year and its category).
vehicles = pd.concat(
    [
        vehicles["IssueYear"],
        pd.cut(vehicles["IssueYear"], list(bins1), labels=labels, include_lowest=True),
    ],
    axis=1,
)

display(vehicles)

# Both scalers are fitted on the same single-feature matrix; they need 2-D
# input, so the Price column is reshaped to (n_rows, 1) once up front.
price_column = data["Price"].to_numpy().reshape(-1, 1)

# Min-max normalisation of the price into the range [-1, 1].
min_max_scaler = preprocessing.MinMaxScaler(feature_range=(-1, 1))
# Standardisation of the price.
stndart_scaler = preprocessing.StandardScaler()

# Flatten each (n_rows, 1) result back to 1-D before storing it as a column.
data["PriceNorm"] = min_max_scaler.fit_transform(price_column).ravel()
data["PriceStand"] = stndart_scaler.fit_transform(price_column).ravel()

display(data)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1289 entries, 0 to 1288
Data columns (total 31 columns):
 #   Column              Non-Null Count  Dtype  
---  ------              --------------  -----  
 0   Id                  1289 non-null   object 
 1   ParsedOn            1289 non-null   object 
 2   Url                 1289 non-null   object 
 3   Name                1289 non-null   object 
 4   Price               1289 non-null   float64
 5   IssueYear           1289 non-null   int64  
 6   Generation          1289 non-null   object 
 7   Mileage             1077 non-null   float64
 8   Pts                 638 non-null    object 
 9   OwnersByPts         1077 non-null   float64
 10  State               1289 non-null   object 
 11  Modification        1289 non-null   object 
 12  Power               1288 non-null   float64
 13  EngineCapacity      1282 non-null   float64
 14  EngineType          1289 non-null   object 
 15  Transmission        1289 non-null   object 
 16  DriveU

(1289, 31)

Id                    False
ParsedOn              False
Url                   False
Name                  False
Price                 False
IssueYear             False
Generation            False
Mileage                True
Pts                    True
OwnersByPts            True
State                 False
Modification          False
Power                  True
EngineCapacity         True
EngineType            False
Transmission          False
DriveUnit             False
Equipment              True
BodyType               True
Color                 False
SteeringWheel         False
VinOrChassisNumber    False
Exchange               True
AddressRaw             True
LocationLatitude      False
LocationLongitude     False
AdditionalOptions     False
CrawlerRatingMin       True
CrawlerRatingMax       True
SellerName             True
SellerType             True
dtype: bool

Id                    False
ParsedOn              False
Url                   False
Name                  False
Price                 False
IssueYear             False
Generation            False
Mileage                True
Pts                    True
OwnersByPts           False
State                 False
Modification          False
Power                  True
EngineCapacity         True
EngineType            False
Transmission          False
DriveUnit             False
Equipment              True
BodyType               True
Color                 False
SteeringWheel         False
VinOrChassisNumber    False
Exchange               True
AddressRaw             True
LocationLatitude      False
LocationLongitude     False
AdditionalOptions     False
CrawlerRatingMin       True
CrawlerRatingMax       True
SellerName             True
SellerType             True
dtype: bool

OwnersByPts
1.0    619
2.0    242
5.0    233
3.0    195
Name: count, dtype: int64

'Обучающая выборка: '

(773, 3)

OwnersByPts
1.0    371
2.0    145
5.0    140
3.0    117
Name: count, dtype: int64

'Контрольная выборка: '

(258, 3)

OwnersByPts
1.0    124
2.0     48
5.0     47
3.0     39
Name: count, dtype: int64

'Тестовая выборка: '

(258, 3)

OwnersByPts
1.0    124
2.0     49
5.0     46
3.0     39
Name: count, dtype: int64

'Обучающая выборка: '

(773, 3)

OwnersByPts
1.0    371
2.0    145
5.0    140
3.0    117
Name: count, dtype: int64

'Обучающая выборка после oversampling: '

(1509, 3)

OwnersByPts
5.0    401
2.0    375
1.0    371
3.0    362
Name: count, dtype: int64

Unnamed: 0,IssueYear,Price,Transmission,Transmission_Вариатор,Transmission_Механика,Transmission_Робот
0,2018,1137000.0,Механика,0.0,1.0,0.0
1,1993,103000.0,Механика,0.0,1.0,0.0
2,2018,499250.0,Механика,0.0,1.0,0.0
3,2018,880000.0,Механика,0.0,1.0,0.0
4,2021,1060000.0,Механика,0.0,1.0,0.0
...,...,...,...,...,...,...
1284,2011,120000.0,Механика,0.0,1.0,0.0
1285,2020,980000.0,Механика,0.0,1.0,0.0
1286,2009,380000.0,Механика,0.0,1.0,0.0
1287,2012,315000.0,Механика,0.0,1.0,0.0


array([  10,  207, 1072])

array([1979.        , 1993.66666667, 2008.33333333, 2023.        ])

Unnamed: 0,IssueYear,IssueYear.1
0,2018,new
1,1993,old
2,2018,new
3,2018,new
4,2021,new
...,...,...
1284,2011,new
1285,2020,new
1286,2009,new
1287,2012,new


Unnamed: 0,OwnersByPts,IssueYear,Price,PriceNorm,PriceStand
0,1.0,2018,1137000.0,-0.889612,-0.138083
1,2.0,1993,103000.0,-0.990000,-0.882245
2,2.0,2018,499250.0,-0.951529,-0.597067
3,1.0,2018,880000.0,-0.914563,-0.323044
4,2.0,2021,1060000.0,-0.897087,-0.193500
...,...,...,...,...,...
1284,1.0,2011,120000.0,-0.988350,-0.870010
1285,3.0,2020,980000.0,-0.904854,-0.251075
1286,5.0,2009,380000.0,-0.963107,-0.682890
1287,5.0,2012,315000.0,-0.969417,-0.729670
