In [1]:
# Load Dependencies
import os
import pandas as pd
import numpy as np
import itertools
from IPython.display import Markdown
from IPython.display import display
from ipywidgets import IntProgress
import time


# Later cells gate their verbose inspection output and a debug CSV dump on this flag.
DEBUG = False

# Load Data
df = pd.read_csv('data/dummied.csv.gz', compression='gzip')

# Select columns for feature engineering
# NOTE(review): `exclude` is not referenced by any other cell visible in this notebook —
# confirm whether it is still needed.
exclude = ['account_id', 'is_current_customer', 'is_self_service', 'country', 'state', 'country_top', 'state_top', 'industry_grouped', 'company_revenue_bucket', 'dnb_founded_time_grouped']

# Store the account identifier with object dtype.
df['account_id'] = df['account_id'].astype('object')

# Cast every float64 column to a 64-bit integer. The width is spelled out ('int64' rather
# than the platform-dependent 'int') because the feature-generation cell selects columns
# with select_dtypes(include='int64'); a narrower integer would silently be skipped there.
# NOTE(review): this cast raises if any float column still contains NaN — presumably the
# input file has none; confirm.
float64_columns = df.select_dtypes(include='float64').columns
df[float64_columns] = df[float64_columns].astype('int64')



In [2]:
# Debug-only overview of the loaded frame; skipped entirely unless DEBUG is True.
if DEBUG:
    df.info()
    # A bare `df.head()` renders nothing when it is not the cell's last expression,
    # so the preview has to go through display() explicitly.
    display(df.head())
    # Count all the things
    display(Markdown("# Boolean (bool)"))
    display(df.select_dtypes(include=['bool']).apply(pd.Series.value_counts, dropna=False))

    display(Markdown("# Boolean with Missing (int64)"))
    display(df.select_dtypes(include=['int64']).apply(pd.Series.value_counts, dropna=False))

    # Object columns get one value-count table each (NaN counted as its own value).
    display(Markdown("# Categories (object)"))
    for col in df.select_dtypes(include=['object']).columns:
        display(df[col].value_counts(dropna=False).to_frame())

In [3]:
# Candidate columns for pairing: every int64 column of df except
# 'is_self_service' and 'is_arr_over_12k'.
int64_columns = (
    df.select_dtypes(include='int64')
    .columns
    .drop(['is_self_service', 'is_arr_over_12k'])
)

# Show the number of candidate columns, then the number of unordered pairs they form.
display(len(int64_columns))
display(sum(1 for _ in itertools.combinations(int64_columns, 2)))

# Optionally dump the candidate column names for offline inspection.
if DEBUG:
    int64_columns.to_frame().to_csv('data/int64_columns.csv', index=False)

145

10440

In [4]:

# Build two interaction features (sum and element-wise maximum) for every unordered
# pair of candidate columns.
# NOTE: each pair produces a two-column, full-length frame that is kept in `new_columns`,
# so memory use grows with (number of pairs) x (rows in df).
pairs = list(itertools.combinations(int64_columns, 2))  # enumerate the pairs once, reuse below

f = IntProgress(min=0, max=len(pairs))  # instantiate the bar
display(f)  # display the bar

new_columns = []

for col1, col2 in pairs:
    new_columns.append(pd.DataFrame({
        f'{col1}_plus_{col2}': df[col1] + df[col2],
        # Element-wise maximum; for the int64 columns selected above this matches
        # df[[col1, col2]].max(axis=1) without building a temporary two-column frame.
        f'{col1}_max_{col2}': np.maximum(df[col1], df[col2]),
    }))
    f.value += 1

# The generated frames are intentionally not concatenated onto df here; only a
# filtered subset is appended in a later cell.


IntProgress(value=0, max=10440)

In [5]:
# Correlate new columns with `is_arr_over_12k`
# Only the first column of each pair frame (the `_plus_` feature) is scored.
# The series is built in one shot from a dict with an explicit float dtype, instead of
# pre-allocating a dtype-less Series indexed by df.columns and enlarging it label by label.
target = df['is_arr_over_12k']
correlations = pd.Series(
    {frame.columns[0]: target.corr(frame.iloc[:, 0]) for frame in new_columns},
    dtype='float64',
)

# Sort correlations
# Undefined correlations (NaN, e.g. from a constant column) are dropped so they can never
# be picked; the ranking is by signed value, highest first.
top100 = correlations.dropna().sort_values(ascending=False).index.tolist()[:100]



In [6]:
# Keep the pair frames whose first (`_plus_`) column name appears in top100.
# Each kept frame carries both of its columns, so two columns are appended per selected pair.
filtered_columns = list(filter(lambda frame: frame.columns[0] in top100, new_columns))

# Append the selected frames to df column-wise.
df = pd.concat([df, *filtered_columns], axis=1)

In [7]:
# Print the index range, column count, dtype breakdown and memory usage of the augmented frame.
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 509599 entries, 0 to 509598
Columns: 400 entries, account_id to crossbeam_product23_customer_max_hg_product_129
dtypes: bool(45), int64(347), object(8)
memory usage: 1.4+ GB


In [8]:
# Persist the augmented frame as a gzip-compressed CSV, leaving the row index out of the file.
df.to_csv(
    'data/feature_engineered.csv.gz',
    index=False,
    compression='gzip',
)