## Python Preprocessing Data

In [None]:
import pandas as pd
from matplotlib import pyplot as plt

In [None]:
## Read data

# Load the comma-separated file into a DataFrame; the column names are
# inferred from the file's header line.
df = pd.read_csv('data.csv', sep=',', header='infer')

In [None]:
## Info from data
# Quick structural overview of the loaded DataFrame.
# NOTE: when run as a single notebook cell, only the final bare expression is
# rendered automatically; run the others in separate cells (or wrap them in
# print()/display()) to see their values.

# Row labels (the index)
df.index

# Column labels
df.columns

# Summary statistics (count, mean, std, min, quartiles, max); by default
# restricted to the numeric columns when any exist
df.describe()

# Data type of each column
df.dtypes

# Prints a concise summary: columns, non-null counts, dtypes, memory usage
df.info()

# Frequency of each distinct row (combination of column values)
df.value_counts()

# Number of distinct values in each column
df.nunique()

In [None]:
## Missing Data

# Keep only the rows in which every column holds a value
df = df.dropna(how='any', axis='index')

# Replace every NA with the constant 5. fillna returns a new DataFrame and
# the result is not stored here, so df itself is left unchanged.
df.fillna(5)

# Boolean mask with the same shape as df: True wherever a value is NA
df.isna()

In [None]:
## Outliers

# Keep only the rows whose value in 'Column' is greater than 1
df_filtered = df.loc[df['Column'].gt(1)]

In [None]:
## Correlations

# Correlation coefficient between two columns of the DataFrame
# (Series.corr uses the Pearson method by default)
df['Var1'].corr(df['Var2'])

# Scatter plot of one numeric column against the other
df.plot.scatter(x='Var1', y='Var2', title='Var1 vs Var2 Tend')

In [None]:
## Graph with Histogram & Boxplot of a variable from data (Like Front - Plant)

def show_distribution(var_data):
    """
    Print summary statistics for a variable and plot its distribution.

    The minimum, mean, median, mode and maximum are printed, then a figure
    with a histogram (top) and a horizontal boxplot (bottom) is shown.
    Dashed vertical lines on the histogram mark the five statistics.

    Parameters
    ----------
    var_data : pandas.Series
        Numeric values to summarise. It must provide the ``min``, ``max``,
        ``mean``, ``median``, ``mode`` and ``dropna`` methods. Missing
        values are ignored both in the statistics and in the plots.

    Returns
    -------
    None
    """

    # Get statistics (these pandas reductions skip missing values)
    min_val = var_data.min()
    max_val = var_data.max()
    mean_val = var_data.mean()
    med_val = var_data.median()
    # mode() can return several values; use the first one
    mod_val = var_data.mode()[0]

    print(f'Minimum:{min_val:.2f}\n'
          f'Mean:{mean_val:.2f}\n'
          f'Median:{med_val:.2f}\n'
          f'Mode:{mod_val:.2f}\n'
          f'Maximum:{max_val:.2f}\n')

    # Plot the same observations the statistics describe: the boxplot's
    # quartiles become NaN (and nothing is drawn) if missing values are
    # passed through to matplotlib.
    plot_data = var_data.dropna()

    # Create a figure for 2 subplots (2 rows, 1 column)
    fig, ax = plt.subplots(2, 1, figsize=(10, 4))

    # Plot the histogram
    ax[0].hist(plot_data)
    ax[0].set_ylabel('Frequency')

    # Add lines for the minimum, mean, median, mode, and maximum
    ax[0].axvline(x=min_val, color='gray', linestyle='dashed', linewidth=2)
    ax[0].axvline(x=mean_val, color='cyan', linestyle='dashed', linewidth=2)
    ax[0].axvline(x=med_val, color='red', linestyle='dashed', linewidth=2)
    ax[0].axvline(x=mod_val, color='yellow', linestyle='dashed', linewidth=2)
    ax[0].axvline(x=max_val, color='gray', linestyle='dashed', linewidth=2)

    # Plot the boxplot
    ax[1].boxplot(plot_data, vert=False)
    ax[1].set_xlabel('Value')

    # Add a title to the Figure
    fig.suptitle('Data Distribution')

    # Show the figure. plt.show() is used instead of fig.show() because
    # Figure.show() does not run the GUI event loop and emits a warning on
    # non-GUI backends (e.g. the notebook inline backend or Agg).
    plt.show()
