I've made these notebooks for myself while studying Machine Learning.

Most of the time I will not use these functions exactly as they are defined here, but they are good starting templates and reminders of how things work.

I hope this can be useful for you too!

[- José H](https://github.com/dev-joseh)

# Part 1: Exploratory Data Analysis CheatSheet

**Get an understanding for which variables are important, view summary statistics, and visualize the data.**

### Import libraries and dataset

In [None]:
# Common libraries
import pandas as pd
import numpy as np
import seaborn as sns

# Some seaborn functions have FutureWarnings that can be suppressed
from warnings import simplefilter
simplefilter(action='ignore', category=FutureWarning)

# Importing and saving Datasets
# CSV
# header=None: the file has no header row, so columns are numbered 0..N-1
# nrows=5: reads only the first 5 rows (quick peek at a large file)
df = pd.read_csv("dataframe.csv", header=None, nrows=5)
# The index is written as the first column unless index=False is passed
df.to_csv("dataframe.csv")

# Excel
# NOTE: the result is not assigned here; use `df = pd.read_excel(...)` to keep it
pd.read_excel("dataframe.xlsx")
df.to_excel("dataframe.xlsx", sheet_name="Sheet1")

### Basic information

In [None]:
# Quick reference of the most common DataFrame inspection calls.
# Each line is meant to be run on its own in a notebook cell.
df.info() # Column names, dtypes, non-null counts and memory usage
# NOTE: n is a placeholder - replace it with an int (pandas defaults to 5 when omitted)
df.head(n) # Returns the first n rows
df.shape # Tuple (rows, columns)
df.index # The row index (labels) of the DataFrame
df.columns # The column labels of the DataFrame
df.count() # Number of non-NA values per column
df.describe() # Summary statistics (count, mean, std, min, quartiles, max)
df.corr(numeric_only=True) # Pairwise correlation table between the numeric columns

### GroupBy function and methods

This function is very important for Data Analysis.

It is used to compare column values from different angles (or classes) by grouping the rows according to one or more attributes.

In [None]:
# Builds a GroupBy object: rows are bucketed by the distinct values of the given column.
# Nothing is computed until an aggregation (sum, mean, ...) is called on it.
groupBy = df.groupby('column_to_be_grouped_by')

In [None]:
# Common operations on the GroupBy object; each line is meant to be run on its own.
groupBy['column_to_be_checked'] # Restricts the following aggregations to a single column
groupBy.describe() # All comparisons at the same time
groupBy.sum() # Sum of each column per group
groupBy.mean(numeric_only=True) # Mean of each numeric column per group
groupBy.count() # Number of non-NA values of each column per group

### Common plotting functions for EDA with pyplot

In [None]:
import matplotlib.pyplot as plt
%matplotlib inline

In [None]:
def plot_hist(data,bin_no=30):
    """
    Draws a histogram of the given values on a fresh figure and shows it.
    :data: array-like (e.g. a pandas.Series) - values to be binned
    :bin_no: int - Number of bins
    """
    # Only the axes are needed; the figure handle is discarded
    _, axes = plt.subplots()

    axes.hist(data, bins=bin_no, color='cornflowerblue')
    plt.show()

In [None]:
def plot_class_counts(data, plot_title="Title"):
    """
    Plots a bar chart of class frequencies with a legend. Used for class count visualization.
    Bars are colored by their count (rescaled to the "rainbow" colormap).
    :data: pd.Series - one class label per instance
    :plot_title: string - Defines plot title
    """
    # Packs each class and its count
    unique_classes, class_counts = np.unique(data.values, return_counts=True)

    # Rescales the counts to [0, 1] so they can be looked up in the colormap.
    # When every class has the same count (or there is a single class) the
    # range is zero; dividing by it would yield NaN colors, so every bar
    # gets the middle of the colormap instead.
    lowest = class_counts.min()
    count_range = class_counts.max() - lowest
    if count_range:
        scaled_counts = (class_counts - lowest) / count_range
    else:
        scaled_counts = np.full(class_counts.shape, 0.5)

    # Picks the colormap for the classes
    my_cmap = plt.get_cmap("rainbow")

    _, ax = plt.subplots()
    ax.bar(unique_classes, class_counts, label=unique_classes, color=my_cmap(scaled_counts))
    ax.set_ylabel('count')
    ax.set_title(plot_title)
    ax.legend(title="Legend")

    plt.show()

In [None]:
def plot_heatmap(dataframe,size=0,show_values=False):
    """
    Plots a correlation heatmap for the numeric columns of the given dataframe.
    :dataframe: pandas.Dataframe
    :size: int - Side of the square figure; 0 does not create a new figure
    :show_values: bool - Writes the correlation value inside each square
    """

    # Optionally resize the image
    if size:
        plt.figure(figsize=(size,size))

    # Builds the correlation heatmap
    data = dataframe.corr(numeric_only=True)
    labels = data.columns
    positions = np.arange(len(labels))
    plt.title('Correlation Heatmap')
    heatmap = plt.imshow(data, cmap='hot', interpolation='nearest', aspect='auto')

    # Adds a legend sidebar
    plt.colorbar(heatmap, label='Correlation')

    # Adds the labels
    plt.xticks(ticks=positions, labels=labels, rotation=45, ha='right')
    plt.yticks(ticks=positions, labels=labels)

    # Add numbers to each square, only when requested
    if show_values:
        for i in range(len(labels)):
            for j in range(len(labels)):
                value = data.iloc[i, j]
                # White text for non-positive correlations, black otherwise
                text_color = 'black' if value > 0.0 else 'white'
                plt.text(j, i, f'{value:.2f}', ha='center', va='center',
                         color=text_color, size=7+size*0.5)

    plt.tight_layout()

    plt.show()

In [None]:
def plot_corr(dataframe,y,drop_val=None):
    """
    Plots, as a sorted bar chart, the correlation of every other numeric attribute with y.
    :dataframe: pandas.Dataframe
    :y: string - Defines target attribute to compare with others
    :drop_val: array of strings - other attributes to leave out of the plot (default: none)
    """
    # y is always dropped: its correlation with itself is trivially 1.
    # None replaces the former mutable [] default, which is shared across calls.
    all_drop_values = [y]
    if drop_val is not None:
        all_drop_values.extend(drop_val)

    correlation = dataframe.corr(numeric_only=True)[y].drop(all_drop_values).sort_values()
    correlation.plot(kind='bar')
    plt.xticks(rotation=45, ha='right')
    plt.grid(axis = 'y')

In [None]:
def plot_scatter(x,y,alpha=1):
    """
    Plots a scatterplot of y against x.
    :x,y: float or array-like - when they have a .name (e.g. pandas.Series),
          it is used as the axis label
    :alpha: float - Marker transparency
    """
    _, ax = plt.subplots()
    ax.scatter(x,y,edgecolors='white',alpha=alpha)

    # Plain arrays, lists and floats have no .name attribute; leave the
    # axis unlabeled for them instead of raising AttributeError.
    x_label = getattr(x, 'name', None)
    y_label = getattr(y, 'name', None)
    if x_label is not None:
        ax.set_xlabel(x_label)
    if y_label is not None:
        ax.set_ylabel(y_label)

    plt.show()

### Seaborn plotting functions

I usually enjoy defining plots from scratch, but Seaborn plots are easy to implement and pretty useful.

In [None]:
# Called here without arguments as placeholders; in practice pass the data,
# e.g. sns.histplot(data=df, x='column').
sns.histplot() # Histogram of a variable's distribution
sns.countplot() # Bar per category with its count; can reorder the bars
sns.boxplot() # Box plot (quartiles and outliers) of a variable

### Data Transformation Functions

In [1]:
def str_map(dataframe,y):
    """
    Maps every distinct value of the given dataframe column to an integer.
    Integers are assigned in order of first appearance, starting at 0.
    The input dataframe is not modified.
    :dataframe: pandas.Dataframe
    :y: string - the column name
    :return: a copy of the dataframe with column y encoded, and the
             {original value: integer} mapping that was used
    """
    # Work on a copy: plain assignment would only alias the caller's
    # dataframe and overwrite its column in place.
    new_df = dataframe.copy()
    mapping = {value: i for i, value in enumerate(new_df[y].unique())}

    # Applying the map to convert strings to integers
    new_df[y] = new_df[y].map(mapping)

    return new_df,mapping

In [None]:
def str_to_cat(dataframe,y):
    """
    Replaces the y string attributes with indicator (dummy) columns.
    The first category of each attribute is dropped (drop_first=True), so an
    attribute with 3 possible values yields 2 new columns, each with value 1
    if the given instance has that value and 0 otherwise.
    The input dataframe is not modified.
    :dataframe: pandas.Dataframe
    :y: string or array of strings - which columns will be replaced
    :return: the dataframe transformed
    """
    # Use the dataframe argument, not a global `df` that happens to exist
    dummies = pd.get_dummies(dataframe[y], dtype='int32', drop_first=True)
    merged = pd.concat([dataframe, dummies], axis='columns')

    # The original string column(s) are now redundant
    return merged.drop(y, axis='columns')