In [21]:
import pandas as pd
from os import listdir
from os.path import isfile, join
import numpy as np
from sklearn.tree import DecisionTreeClassifier 
from sklearn.model_selection import train_test_split 
from sklearn import metrics 
from sklearn.preprocessing import OneHotEncoder
import sklearn
import gc
import sys
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression,SGDRegressor
base_path = '/kaggle/input/japan-real-estate-transaction-prices/'
from copy import deepcopy as d
from sklearn.metrics import r2_score
import itertools
import matplotlib.pyplot as plt

In [22]:
def get_dataset(dataset_path: str, codes_path: str) -> pd.DataFrame:
    """Load every file of a directory into one frame and attach the code table.

    Parameters
    ----------
    dataset_path : str
        Directory whose regular files are each read with ``pd.read_csv``.
        Sub-directories are skipped.
    codes_path : str
        Path of the CSV code table. Its ``JpName`` column is dropped before
        the join.

    Returns
    -------
    pd.DataFrame
        The concatenated rows, with ``Quarter`` renamed to ``Code`` and the
        code-table columns appended. Code-table columns whose names clash
        with existing ones receive the ``_pref`` suffix.

    Raises
    ------
    ValueError
        Raised by ``pd.concat`` when the directory contains no files.
    """
    # os.listdir returns entries in arbitrary order; sorting keeps the row
    # order (and therefore any seeded split done later) stable between runs.
    csv_paths = [
        join(dataset_path, entry)
        for entry in sorted(listdir(dataset_path))
        if isfile(join(dataset_path, entry))
    ]

    # Concatenate the per-file frames into one main frame. Row labels are not
    # reset (ignore_index keeps its default), so the resulting index can
    # contain duplicates.
    main_df = pd.concat(pd.read_csv(path, low_memory=False) for path in csv_paths)

    codes = pd.read_csv(codes_path)

    # Drop the JpName column from the code table.
    codes = codes.drop(columns=['JpName'])

    # Rename the Quarter column to Code.
    main_df = main_df.rename(columns={"Quarter": "Code"})

    # NOTE(review): with on='Code' the values of main_df['Code'] are matched
    # against the *index* of `codes` (the default positional index produced by
    # read_csv), not against a column of `codes` - confirm this is intended.
    main_df = main_df.join(codes, on='Code', rsuffix="_pref")

    return main_df

In [23]:
def data_preprocessing(df: pd.DataFrame, target: str, columns_to_drop: list) -> (pd.DataFrame, pd.DataFrame, list, list):
    """Split a frame into imputed categorical and numerical parts.

    Parameters
    ----------
    df : pd.DataFrame
        Input data; it is not modified.
    target : str
        Name of the target column. Rows where it is missing are removed.
    columns_to_drop : list
        Columns removed before any other processing.

    Returns
    -------
    df_cat : pd.DataFrame
        Object-dtype columns with missing values replaced by ``"Unknown"``.
    df_num : pd.DataFrame
        Numeric columns with missing values replaced by the column mean.
    categorical_columns : list
        Names of the columns in ``df_cat``.
    numerical_columns : list
        Names of the columns in ``df_num``.

    Raises
    ------
    KeyError
        If ``target`` or any of ``columns_to_drop`` is not a column of ``df``.
    """
    df = df.drop(columns=columns_to_drop)

    # Honour the `target` argument (it used to be overwritten with a literal).
    # Rows without a target value are removed.
    df = df.dropna(subset=[target])

    # The builtin `object` replaces the `np.object` alias, which current NumPy
    # releases no longer provide.
    categorical_columns = df.select_dtypes(include=[object]).columns.tolist()
    numerical_columns = df.select_dtypes(include=[np.number]).columns.tolist()

    df_cat = df[categorical_columns].fillna("Unknown")

    # Select the numeric columns and fill their gaps with the column means.
    df_num = df[numerical_columns]
    df_num = df_num.fillna(df_num.mean())

    return df_cat, df_num, categorical_columns, numerical_columns
    

In [24]:
def get_encoder(df: pd.DataFrame, target: str, columns_to_drop: list) -> "tuple[pd.DataFrame, OneHotEncoder, StandardScaler, list, list]":
    """Preprocess the data and fit the one-hot encoder and the scaler on it.

    Parameters
    ----------
    df : pd.DataFrame
        Raw data, forwarded to ``data_preprocessing``.
    target : str
        Name of the target column, forwarded to ``data_preprocessing``.
    columns_to_drop : list
        Columns to discard, forwarded to ``data_preprocessing``.

    Returns
    -------
    df : pd.DataFrame
        The categorical frame with every numerical column appended to it.
    enc : OneHotEncoder
        Encoder fitted on the categorical frame.
    scaler : StandardScaler
        Scaler fitted on the whole numerical frame.
    categorical_columns : list
        Names of the categorical columns.
    numerical_columns : list
        Names of the numerical columns.
    """
    enc = OneHotEncoder(handle_unknown='error', drop='first')
    scaler = StandardScaler()

    # Use data_preprocessing to bring the data into a form fit for encoding.
    df_cat, df_num, categorical_columns, numerical_columns = data_preprocessing(df, target, columns_to_drop)

    enc.fit(df_cat)
    # The scaler sees every column of df_num, whatever data_preprocessing
    # classified as numeric.
    scaler.fit(df_num)

    # Reassemble one frame: categorical columns first, numerical ones after.
    # Column-wise assignment is used on purpose; both frames carry the same
    # row labels because they were cut from the same preprocessed frame.
    df = df_cat
    for column in df_num.columns:
        df[column] = df_num[column]

    return df, enc, scaler, categorical_columns, numerical_columns

In [25]:
def transform_df_to_arrays(df: pd.DataFrame, categorical_columns: list, numerical_columns: list, target: str, scaler: "StandardScaler", enc: "OneHotEncoder") -> "tuple[np.ndarray, np.ndarray]":
    """Turn a frame into a feature matrix and a target vector.

    Parameters
    ----------
    df : pd.DataFrame
        Data to convert. It is left unmodified.
    categorical_columns : list
        Columns passed to ``enc.transform``.
    numerical_columns : list
        Columns passed to ``scaler.transform``.
    target : str
        Column returned as ``y`` and excluded from ``X``.
    scaler : StandardScaler
        Fitted scaler; only its ``transform`` method is used.
    enc : OneHotEncoder
        Fitted encoder; the result of its ``transform`` must offer
        ``toarray()``.

    Returns
    -------
    X : np.ndarray
        Encoded categorical columns followed by every remaining column of
        the frame except ``target``.
    y : np.ndarray
        Values of the ``target`` column. They are the scaled values whenever
        ``target`` is listed in ``numerical_columns``.
    """
    categorical_array = enc.transform(df[categorical_columns]).toarray()

    # Scale on a private copy: writing the scaled values back into the
    # caller's frame would scale the same rows again if the frame is reused.
    scaled = df.copy()
    scaled[numerical_columns] = scaler.transform(scaled[numerical_columns])

    numerical_array = scaled.drop(columns=[target] + categorical_columns).values

    # Join the encoded categorical matrix with the numerical matrix.
    X = np.concatenate([categorical_array, numerical_array], axis=1)
    y = scaled[target].values

    return X, y

def get_generator(df: pd.DataFrame, categorical_columns: list, numerical_columns: list, target: str, scaler: sklearn.preprocessing.StandardScaler, enc: sklearn.preprocessing.OneHotEncoder, number_of_splits: int = 32) -> (np.array, np.array):
    """Endlessly yield ``(X, y)`` batches cut from consecutive rows of ``df``.

    This is a generator function: nothing runs until the first ``next()``.
    After the last chunk it starts again from the first one.

    Parameters
    ----------
    df : pd.DataFrame
        Data to split into batches.
    categorical_columns, numerical_columns, target, scaler, enc
        Forwarded unchanged to ``transform_df_to_arrays``.
    number_of_splits : int, default 32
        Number of chunks. The first ``len(df) % number_of_splits`` chunks hold
        one row more than the others, which is the same partition
        ``np.array_split`` produces.

    Yields
    ------
    (np.ndarray, np.ndarray)
        The pair returned by ``transform_df_to_arrays`` for the next chunk.

    Raises
    ------
    ValueError
        On the first ``next()`` when ``number_of_splits`` is smaller than 1.
    """
    if number_of_splits < 1:
        raise ValueError("number_of_splits must be at least 1")

    # Row boundaries are computed by hand and applied with .iloc instead of
    # calling np.array_split on the frame, which relies on the deprecated
    # DataFrame.swapaxes.
    base_size, remainder = divmod(len(df), number_of_splits)
    sizes = [base_size + 1] * remainder + [base_size] * (number_of_splits - remainder)
    stops = list(itertools.accumulate(sizes))
    starts = [0] + stops[:-1]

    # Only the integer bounds are cycled, so no chunk is kept alive between
    # passes.
    for start, stop in itertools.cycle(zip(starts, stops)):
        # Hand over a copy so nothing done to the chunk can reach `df`.
        chunk = df.iloc[start:stop].copy()
        # Use transform_df_to_arrays to convert the selected piece of data.
        yield transform_df_to_arrays(chunk, categorical_columns, numerical_columns, target, scaler, enc)
    

In [26]:
def train(dataset_path: str, codes_path: str, number_of_splits: int, test_size: float) -> "tuple[np.ndarray, SGDRegressor]":
    """Fit an SGD regressor batch by batch and score it on held-out batches.

    Parameters
    ----------
    dataset_path : str
        Directory with the CSV files, forwarded to ``get_dataset``.
    codes_path : str
        Path of the code table, forwarded to ``get_dataset``.
    number_of_splits : int
        Total number of batches. ``int(number_of_splits * (1 - test_size))``
        of them are used for training and ``int(number_of_splits * test_size)``
        for evaluation; ``int`` truncates, so the two need not add up to
        ``number_of_splits``.
    test_size : float
        Share of the rows held out by ``train_test_split``.

    Returns
    -------
    scores : np.ndarray
        One R^2 value per evaluation batch (empty when there are none).
    reg : SGDRegressor
        The regressor after all training batches.
    """
    target = 'TradePrice'
    columns_to_drop = 'UnitPrice PricePerTsubo TotalFloorArea Remarks TimeToNearestStation DistrictName NearestStation'.split()

    print('[INFO] Loading dataset...')

    df = get_dataset(dataset_path, codes_path)

    print("[INFO] Dataset has been loaded...")

    df, enc, scaler, categorical_columns, numerical_columns = get_encoder(df=df, target=target, columns_to_drop=columns_to_drop)

    print("[INFO] Encoder has been created...")

    # Split the data into a training set and a test set.
    df_train, df_test = train_test_split(df, test_size=test_size, random_state=42)

    print("[INFO] Data has been split...")

    # Batch counts are computed once and reused for the generators and loops.
    n_train_batches = int(number_of_splits * (1 - test_size))
    n_test_batches = int(number_of_splits * test_size)

    test_generator = get_generator(df_test, categorical_columns, numerical_columns, target, scaler, enc, number_of_splits=n_test_batches)
    train_generator = get_generator(df_train, categorical_columns, numerical_columns, target, scaler, enc, number_of_splits=n_train_batches)

    # The full frame is no longer needed once the two parts exist.
    del df
    gc.collect()

    print("[INFO] Generator has been created...")
    print("[INFO] Starting training...")

    # Seeded like the split above so repeated runs produce the same scores.
    reg = SGDRegressor(random_state=42)

    for _ in range(n_train_batches):
        # Pull the next batch from the training generator.
        X, y = next(train_generator)
        reg.partial_fit(X, y)

    r2_scores = []
    for _ in range(n_test_batches):
        # Pull the next batch from the test generator.
        X, y = next(test_generator)
        # Record the coefficient of determination of the predictions.
        r2_scores.append(r2_score(y, reg.predict(X)))

    print("[INFO] Training has finished...")

    return np.array(r2_scores), reg


In [27]:
# Inputs below base_path: the folder of trade-price CSV files and the
# prefecture code table.
dataset_path, codes_path = (
    join(base_path, leaf) for leaf in ("trade_prices", "prefecture_code.csv")
)

In [29]:
# Run the whole pipeline: 3000 batches in total, a quarter of the rows held out.
scores, model = train(
    dataset_path=dataset_path,
    codes_path=codes_path,
    number_of_splits=3000,
    test_size=0.25,
)

In [32]:
# Mean of the per-batch scores returned by train().
np.mean(scores)

In [33]:
# Spread of the per-batch scores; titled and labelled so the figure can be
# read without the surrounding cells.
fig, ax = plt.subplots()
ax.boxplot(scores)
ax.set_title("$R^2$ per test batch")
ax.set_ylabel("$R^2$")
ax.set_xticks([])
plt.show()