In [1]:
import pandas_profiling as pp
import pandas as pd
from pandas import read_csv
from sklearn.preprocessing import OneHotEncoder, StandardScaler
import ast

In [2]:
NUMERICAL_FEATURES = [
    'year',
    'floor',
    'rooms',
    'total_area',
    'living_area',
    'longitude',
    'latitude',
    'description',
    'image_urls'
]

BOOLEAN_FEATURES = [
    'price_verification',
    'apartment_verification'
]

CATEGORIAL_FEATURES = [
    'heating',
    'walls',
    'region',
    'city',
]

TARGET = 'price'

SUBSTITUTE_FEATURES = [
    'description',
    'image_urls'
]

USELESS_FEATURES = [
    'title',
    'seller',
    'street',
    'publish_date',
    'offer_id',
    'apartment_id'
]


In [3]:
def extract_X_y(
        df,
        target
):
    """
    Get numpy arrays from pandas data frame
    :type df: pd.DataFrame
    :type target: str
    """
    return np.array(df.loc[:, df.columns != target]), np.array(df.loc[:, df.columns == target])

In [4]:
def load_dataset(
        csv,
        target,
        categorial_features,
        boolean_features,
        numerical_features,
        useless_features,
        substitute_features
) -> tuple:
    """
    Load the dataset, preprocess it, split it in X, y numpy variables
    :type csv: str
    :type target: str
    :type categorial_features: list
    :type boolean_features: list
    :type numerical_features: list
    :type useless_features: list
    :type substitute_features: list
    """
    print("Collecting the data from csv file...")
    df = pd.read_csv(csv)
    print("Substituting the features...")
    df = substitute(
        df=df,
        substitute_features=substitute_features
    )
    print("Dropping the useless features...")
    df.drop(
        columns=useless_features,
        inplace=True
    )
    print("Filling the NA values...")
    df.fillna(
        value=df.mean(),
        inplace=True
    )
    print("Scaling the numeric features...")
    df = scale_features(
        df=df,
        numeric_features=numerical_features
    )
    print("Performing one-hot-encoding...")
    df = pd.get_dummies(
        data=df,
        columns=(categorial_features + boolean_features),
    )
#     print("Extracting the X, y features from the pandas DataFrame object...")
#     X, y = extract_X_y(
#         df=df,
#         target=target
#     )
    return df

In [None]:
def scale_features(
        df,
        numeric_features
) -> pd.DataFrame:
    """
    Scale the numerical features and return the pandas data frame with that modifications
    :type df: pd.DataFrame
    :type numeric_features: list
    """
    scaled_features = df[numeric_features]
    scaled_features = StandardScaler() \
        .fit_transform(scaled_features)
    df[numeric_features] = scaled_features
    return df

In [None]:
def substitute(
        df,
        substitute_features
) -> pd.DataFrame:
    """
    Substitute features with len property
    :type df: pd.DataFrame
    :type substitute_features: list
    """
    for feature in substitute_features:
        df[feature] = df[feature].map(lambda value: feature_to_len(feature, value))
    return df

In [None]:
def feature_to_len(
        feature,
        value
):
    """
    Extract the length of the feature
    :type feature: object
    """
    if not isinstance(value, object) or pd.isna(value) or pd.isnull(value):
        return 0
    if feature == 'description':
        return len(str(value))
    if feature == 'image_urls':
        value = str(value)
        value = ast.literal_eval(value)
        return len(value)
    return 0

In [None]:
df = read_csv('data.csv')

In [None]:
df = load_dataset(
    csv='data.csv',
    target=TARGET,
    categorial_features=CATEGORIAL_FEATURES,
    boolean_features=BOOLEAN_FEATURES,
    numerical_features=NUMERICAL_FEATURES,
    useless_features=USELESS_FEATURES,
    substitute_features=SUBSTITUTE_FEATURES
)

Collecting the data from csv file...
Substituting the features...
Dropping the useless features...
Filling the NA values...


In [None]:
pp.ProfileReport(df)