<div class="alert alert-block alert-info" style="margin-top: 20px">
<b>Useful Functions for Python Projects</b>
</div>

This notebook collects some useful functions that speed up the development of a project.

### Study General Information about the dataset

In [None]:
# function to determine if columns in file have null values
def get_percent_of_na(df, num):
    """Print the share and count of nulls for every column that has any.

    One line is printed per column containing at least one null, followed
    by a bold summary line. Nothing is returned.

    Args:
        df: DataFrame to inspect. It is only read, never modified.
        num: Number of decimal places used when formatting the percentage.
    """
    # A single pass over the frame; the counts are reused for both the
    # percentage and the absolute number, so no per-column re-scan (and no
    # defensive copy of the frame) is needed.
    null_counts = df.isna().sum()
    total_rows = df.shape[0]

    count = 0
    # Iterating the counts (rather than indexing df[column]) also keeps this
    # working when the frame has duplicated column labels.
    for column, num_of_nulls in null_counts.items():
        if num_of_nulls == 0:
            continue
        count += 1
        percent = num_of_nulls / total_rows
        print(f'Column {column} has {percent:.{num}%} percent of Nulls, and {num_of_nulls} of nulls')

    if count != 0:
        print("\033[1m" + f'There are {count} columns with NA.' + "\033[0m")
    else:
        print()
        print("\033[1m" + 'There are no columns with NA.' + "\033[0m")
        
# function to display general information about the dataset
def get_info(df):
    """Print a general overview of a DataFrame.

    Shows, in order: head, info, numeric describe, object-column describe,
    the null report from get_percent_of_na, the shape and the number of
    duplicated rows. Nothing is returned.

    Relies on `display` being available without an import, as it is in a
    Jupyter/IPython session.

    Args:
        df: DataFrame to summarise.
    """
    separator = '-' * 100

    print("\033[1m" + separator + "\033[0m")
    print('Head:')
    print()
    display(df.head())

    print(separator)
    print('Info:')
    print()
    # DataFrame.info() prints its report itself and returns None, so wrapping
    # it in display() only added a stray "None" to the output.
    df.info()

    print(separator)
    print('Describe:')
    print()
    display(df.describe())
    print(separator)
    # describe(include='object') raises ValueError when no column matches,
    # so the object summary is shown only when there is something to show.
    if df.select_dtypes(include='object').shape[1] > 0:
        display(df.describe(include='object'))
    else:
        print('No object columns to describe.')
    print()

    print('Columns with nulls:')
    # get_percent_of_na prints its own report and returns None; displaying
    # the return value would again just show "None".
    get_percent_of_na(df, 4)

    print(separator)
    print('Shape:')
    print(df.shape)

    print(separator)
    print('Duplicated:')
    duplicate_count = df.duplicated().sum()
    print("\033[1m" + 'We have {} duplicated rows.\n'.format(duplicate_count) + "\033[0m")
    print()

### Data Wrangling and Preprocessing

The following are some useful functions in Python.

In [None]:
# apply numpy vectorizing to id
# Strip everything up to and including the last underscore, element-wise,
# e.g. '1000_35' -> '35'. Declaring otypes up front lets the vectorized
# function accept empty input; without it numpy has to call the function on
# the first element to discover the output type and raises on size-0 input.
get_id = np.vectorize(lambda x: re.sub(r'.*_', '', x), otypes=[str])

In [None]:
# function to convert column names to lowercase
def lowercase_columns(df):
    """Return a copy of df whose string column labels are lowercased.

    Labels that are not strings (for example integer labels) are left
    unchanged instead of raising a TypeError. The input frame is not
    modified.

    Args:
        df: DataFrame whose column labels should be normalised.

    Returns:
        A new DataFrame with the renamed columns.
    """
    return df.rename(
        columns=lambda label: label.lower() if isinstance(label, str) else label
    )

# Example usage: rebind df to the renamed copy and show the new labels.
# NOTE(review): `df` is not defined in this cell; presumably it is loaded
# earlier in the notebook — confirm before running this cell on its own.
df = lowercase_columns(df)
print(df.columns)    

In [None]:
# function to calculate the percentage of missing values
def missing_values_table(df):
    """Summarise the missing values of a DataFrame, one row per column.

    Prints how many columns the frame has and how many of them contain
    missing values, then returns the summary table.

    Args:
        df: DataFrame to inspect. It is only read, never modified.

    Returns:
        DataFrame indexed by column name with the columns 'Missing Values'
        (count) and '% of Total Values' (rounded to one decimal). Only
        columns with at least one missing value are included, sorted by
        percentage in descending order.
    """
    # Count the nulls once and derive the percentage from the same counts.
    mis_val = df.isnull().sum()
    mis_val_percent = 100 * mis_val / len(df)

    mis_val_table = pd.concat(
        [mis_val, mis_val_percent],
        axis=1,
        keys=['Missing Values', '% of Total Values'],
    )

    # Filter on the count, not the percentage: for a frame with zero rows the
    # percentage is NaN, and NaN != 0 would wrongly keep every column.
    has_missing = mis_val_table['Missing Values'] != 0
    mis_val_table_ren_columns = (
        mis_val_table[has_missing]
        .sort_values('% of Total Values', ascending=False)
        .round(1)
    )

    print(
        f"Your selected dataframe has {df.shape[1]} columns.\n"
        f"There are {mis_val_table_ren_columns.shape[0]}"
        " columns that have missing values."
    )

    return mis_val_table_ren_columns

These functions fill in the missing values of a column with a value picked at random from the unique values already present in that column.

In [None]:
# function to assign random value from unique values in column
def _fill_missing_with_random_existing(df, column):
    """Fill the nulls of df[column] with one randomly chosen existing value.

    A single value is drawn and used for every missing entry of the column.
    If the column has no non-null value to draw from, it is left untouched.
    """
    # dropna() removes NaN, None and pd.NA alike before the unique values
    # are collected, so a null can never be drawn as the replacement.
    candidates = df[column].dropna().unique().tolist()
    if not candidates:
        return
    # Assign the result back to the frame: calling fillna(inplace=True) on
    # the selected column acts on an intermediate object and is not
    # guaranteed to update df (it does not under pandas copy-on-write).
    df[column] = df[column].fillna(random.choice(candidates))


def fill_in_name(df):
    """Fill missing 'name' values in place with a random existing name.

    Args:
        df: DataFrame with a 'name' column; modified in place.
    """
    _fill_missing_with_random_existing(df, 'name')


def fill_in_genre(df):
    """Fill missing 'genre' values in place with a random existing genre.

    Args:
        df: DataFrame with a 'genre' column; modified in place.
    """
    _fill_missing_with_random_existing(df, 'genre')

This function converts the given columns to the correct data type.

In [None]:
# convert data to the correct data type
def convert_to_type(df, cols, type_val):
    """Cast each listed column of df to type_val, modifying df in place.

    Args:
        df: DataFrame whose columns are converted.
        cols: Iterable of column labels to convert.
        type_val: Target type, passed unchanged to Series.astype.
    """
    for column_name in cols:
        converted = df[column_name].astype(type_val)
        df[column_name] = converted
        
# Example usage on the notebook's `df`: cast the text-like columns to str.
# NOTE(review): depending on the pandas version, astype(str) may turn missing
# values into the literal string 'nan' — confirm nulls are handled first.
convert_to_type(df, ['name', 'platform', 'genre', 'rating'], str)
# Integer conversion fails on NaN, hence the ordering requirement below.
# NOTE(review): if user_score holds fractional values, casting to int
# truncates them — confirm this is intended.
convert_to_type(df, ['year_of_release', 'user_score'], int)    # use after removing NaNs

When dealing with datetimes, we may need to create new features from a timestamp, such as the day, month and year. The function below does that.

In [7]:
# change date type to datetime and split into day, month and year
def new_date_features(df):
    """Parse the first 'date' column and derive day, month and year from it.

    The first column whose label contains 'date' is converted to datetime,
    and three columns are added: 'day' (weekday name), 'month' (month name)
    and 'year'. The frame is modified in place and also returned.

    Args:
        df: DataFrame with at least one column whose label contains 'date'.

    Returns:
        The same DataFrame object, with the converted and added columns.

    Raises:
        IndexError: If no column label contains 'date'.
    """
    for label in df.columns:
        try:
            is_date_column = 'date' in label
        except TypeError:
            # Labels that do not support `in` (ints, timestamps, ...) cannot
            # name a date column; skip them instead of failing.
            continue
        if is_date_column:
            date_column = label
            break
    else:
        # Same exception type as the original empty-list lookup, but with a
        # message that explains what is missing.
        raise IndexError("no column with 'date' in its name was found")

    df[date_column] = pd.to_datetime(df[date_column])
    timestamps = df[date_column].dt
    df['day'] = timestamps.day_name()
    df['month'] = timestamps.month_name()
    df['year'] = timestamps.year
    return df

When viewing a report, we may want to display two tables side by side. The function below can help us do that.

In [8]:
# function to display tables side by side for analysis
from IPython.display import display_html
from itertools import chain,cycle
def display_side_by_side(*args, titles=cycle([''])):
    """Render several DataFrames next to each other in a notebook.

    Each frame is converted to HTML, its table is made inline so the tables
    sit on one row, and the combined markup is sent to display_html.

    Args:
        *args: DataFrames to show, left to right.
        titles: Iterable of titles, paired with the frames in order. When it
            is shorter than the number of frames, the remaining frames get
            '</br>' as their title. Defaults to empty titles.
    """
    fragments = []
    for df, title in zip(args, chain(titles, cycle(['</br>']))):
        # Restyle only the opening <table> tag. Replacing every occurrence of
        # "table" also rewrote the closing tag and any cell text or column
        # label that happened to contain the word.
        table_html = df.to_html().replace(
            '<table', '<table style="display:inline"', 1
        )
        fragments.append(
            '<th style="text-align:center"><td style="vertical-align:top">'
            f'<h6>{title}</h6>'
            f'{table_html}'
            '</td></th>'
        )
    display_html(''.join(fragments), raw=True)

### EDA

The code below automates the process of plotting histograms for numeric columns and bar charts for categorical columns.

In [None]:
# populate the list of numeric and categorical attributes
# Subset of agg_df holding the attributes that will be plotted.
plot_data = agg_df[['calls made', 'call duration', 'messages sent', 'mb used', 'plan', 'call cost', 'gb cost', 'message cost', 'revenue']]

# Numeric attributes, in column order.
num_list = [name for name in plot_data if is_numeric_dtype(plot_data[name])]

# Categorical attributes: string-typed columns that are not numeric.
cat_list = [
    name
    for name in plot_data
    if not is_numeric_dtype(plot_data[name]) and is_string_dtype(plot_data[name])
]

print(num_list)
print(cat_list)

In [None]:
# create histogram and bar chart
for column in plot_data:
    # One figure per attribute, keyed by the column name; labels and title
    # are set before the series is drawn onto it.
    plt.figure(column)
    plt.xlabel(column)
    plt.ylabel('frequency')
    plt.title('Histogram of ' + column)

    # Numeric attributes get a histogram; string attributes get a bar chart
    # of their value counts. Any other dtype is left as an empty figure.
    if is_numeric_dtype(plot_data[column]):
        agg_df[column].plot.hist()
    elif is_string_dtype(plot_data[column]):
        agg_df[column].value_counts().plot.bar()