In [None]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# `Raw Data`

In [None]:
# Load the first measurements file (CSV), then report its dimensions and
# preview the first rows.
data1 = pd.read_csv('data/measurements.csv')

print(f"Shape is: {data1.shape}")
data1.head()

In [None]:
# Load the second measurements file (Excel workbook), then report its
# dimensions and preview the first rows.
data2 = pd.read_excel('data/measurements2.xlsx')

print(f"Shape is: {data2.shape}")
data2.head()

# `Concat Data`

In [None]:
# Stack both sources into a single frame and drop exact duplicate rows.
# The index is rebuilt (ignore_index / reset_index) so every row has a unique
# label: a plain concat keeps each file's own 0..n-1 labels, and duplicated
# labels make later index-aligned operations ambiguous.
# NOTE(review): de-duplication compares the raw values, so a row present in
# both files is only removed if it is formatted identically in each (a decimal
# comma string such as '11,2' would not match the float 11.2) - confirm that
# this is intended.
data = (
    pd.concat([data1, data2], ignore_index=True)
    .drop_duplicates()
    .reset_index(drop=True)
)
# Normalise the column names to lower snake_case.
data.columns = data.columns.str.lower().str.replace(' ', '_')

print(f"Shape is: {data.shape}")
data.head()

In [None]:
# Count the missing values in each column of the combined frame.
data.isna().sum()

# `Cleaned Data`

In [None]:
def display_types_and_nan(df):
    """Display one summary line per column: its dtype and its NaN count.

    The lines are collected into a list of strings formatted as
    ``'<column>: <dtype> | <number of missing values>'`` and handed to
    ``display``. Nothing is returned.
    """
    summary = []
    for col in df.columns:
        series = df[col]
        summary.append(f'{col}: {series.dtype} | {series.isna().sum()}')
    display(summary)

def fix_float_col(cell):
    """Normalise one raw cell so that its column can be cast to float.

    Parameters
    ----------
    cell : str, number or None
        Raw value. Strings may use a decimal comma (``'11,2'``).

    Returns
    -------
    object
        ``np.nan`` when ``cell`` is ``None``; for a string, the same text with
        every ``','`` replaced by ``'.'`` (still a string - the caller does
        the float cast); any other value is returned unchanged.
    """
    if cell is None:
        return np.nan
    if isinstance(cell, str):
        return cell.replace(',', '.')
    # Anything else (float, int, numpy scalar, NaN) has no comma to rewrite.
    # Checking for str instead of `type(cell) == float` keeps ints and numpy
    # scalars from reaching `.replace`, which they do not have.
    return cell
    
def transform_and_fill_nan_with_mean(df, col):
    """Parse ``df[col]`` as floats and return it with NaNs replaced by the mean.

    Every cell is passed through ``fix_float_col`` and the column is cast to
    float.

    Side effect: ``df[col]`` is overwritten in place with the parsed values,
    which still contain their NaNs. The mean-filled column is only returned,
    so the caller has to assign it back.
    """
    parsed = df[col].apply(fix_float_col).astype(float)
    df[col] = parsed  # in-place overwrite of the caller's frame
    column_mean = df[col].mean()
    filled = df[col].fillna(column_mean)
    return filled

In [None]:
# Show each column's dtype and missing-value count for the combined frame.
display_types_and_nan(data)

In [None]:
# Remove the two refill columns from the frame (in place), then re-check the
# remaining columns' dtypes and missing-value counts.
data.drop(columns=['refill_liters', 'refill_gas'], inplace=True)
display_types_and_nan(data)

In [None]:
# Parse each of these columns as floats and fill their gaps with the column mean.
for numeric_col in ('distance', 'consume', 'temp_inside'):
    data[numeric_col] = transform_and_fill_nan_with_mean(data, numeric_col)
display_types_and_nan(data)

In [None]:
# Flag the rows whose 'specials' text contains 'snow'; rows where 'specials'
# is missing get the placeholder 'Unknown'.
# The filled result is assigned back to the column on purpose: calling
# fillna(..., inplace=True) on data['snow'] works on a temporary Series and
# leaves the frame untouched when pandas Copy-on-Write is active.
data['snow'] = data['specials'].str.contains('snow').fillna('Unknown')
display_types_and_nan(data)

In [None]:
# 'specials' has been reduced to the 'snow' flag; remove the original column
# (in place) and re-check dtypes and missing-value counts.
data.drop(columns=['specials'], inplace=True)
display_types_and_nan(data)

In [None]:
# Write the cleaned frame to CSV; index=False leaves the row index out of the file.
data.to_csv('data/cleaned_data_visualization.csv', index=False)

# `Check correlations`

In [None]:
def display_heatmap(dataframe):
    """Draw the lower triangle of ``dataframe``'s correlation matrix as a heatmap.

    The correlations come from ``dataframe.corr()``. The diagonal and the
    upper triangle are masked out. A new 11x9 figure is created for the plot;
    nothing is returned.
    """
    correlations = dataframe.corr()

    # True marks the cells to hide: the diagonal and everything above it.
    hidden_cells = np.triu(np.ones_like(correlations, dtype=bool))

    # Custom diverging colormap, centred on zero by the heatmap call below.
    palette = sns.diverging_palette(220, 10, as_cmap=True)

    _, axes = plt.subplots(figsize=(11, 9))
    sns.heatmap(
        correlations,
        mask=hidden_cells,
        cmap=palette,
        vmax=.3,  # upper limit of the colour scale
        center=0,
        square=True,
        linewidths=.5,
        cbar_kws={"shrink": .5},
        ax=axes,
    )

In [None]:
# Integer codes for the two categorical columns.
replace_dict_snow = {False: 0, True: 1, 'Unknown': -1}
replace_dict_gas_type = {'E10': 0, 'SP98': 1}

# Series.map turns any value that is missing from its mapping into NaN.
data['snow'] = data['snow'].map(replace_dict_snow)
data['gas_type'] = data['gas_type'].map(replace_dict_gas_type)

# Save the encoded frame without the row index.
data.to_csv('data/cleaned_data_modeling.csv', index=False)

In [None]:
# Plot the correlation heatmap for the encoded frame.
display_heatmap(data)