## Data Overview

In [1]:
import pandas as pd
# Allow up to 99 columns in rendered frames so wide tables are not truncated.
pd.set_option('display.max_columns', 99)

# Preview: load just the first 7 rows to see the column layout.
pd.read_csv(
    'crunchbase-investments.csv',
    nrows=7,
    encoding='ISO-8859-1',
)

FileNotFoundError: [Errno 2] No such file or directory: 'crunchbase-investments.csv'

## Missing value counts per column

In [None]:
# Count missing values per column without loading the whole file at once:
# tally each 5000-row chunk, then add the per-chunk tallies together.
crunch_iter = pd.read_csv(
    'crunchbase-investments.csv',
    encoding='ISO-8859-1',
    chunksize=5000,
)
missing_count = []
for chunk in crunch_iter:
    nulls_in_chunk = chunk.isna().sum()
    missing_count.append(nulls_in_chunk)

# The concatenated Series repeats every column name once per chunk;
# grouping on the index collapses those into a single total per column.
combined_missing = pd.concat(missing_count)
unique_missing = combined_missing.groupby(level=0).sum()
print(unique_missing.sort_values())

## Memory Footprints

In [None]:

# Measure the in-memory size of each column, chunk by chunk.
crunch_iter = pd.read_csv(
    'crunchbase-investments.csv',
    encoding='ISO-8859-1',
    chunksize=5000,
)
memory_footprint = []
for chunk in crunch_iter:
    # deep=True asks pandas to measure the contents of object columns too.
    chunk_bytes = chunk.memory_usage(deep=True)
    memory_footprint.append(chunk_bytes)

# Sum the per-chunk byte counts for each label, then convert bytes to MB.
memory_combined = pd.concat(memory_footprint)
bytes_per_mb = 1024 * 1024
memory_col = memory_combined.groupby(level=0).sum() / bytes_per_mb
print(memory_col.round(4))
print('\nTotal Memory (MB): ', memory_col.sum().round(4))

## Drop Unnecessary Columns

In [None]:
# Columns excluded from the rest of the analysis. Per the original note these
# are URL-like columns or columns with too many missing values (>90% missing).
# NOTE(review): the region and funded_* entries do not obviously match either
# criterion -- confirm the rationale for dropping them.
# NOTE(review): 'investor_category_code' used to be listed three times; the
# duplicates are removed here. If two other columns were intended, add them.
drop_cols = [
    'company_permalink',
    'company_region',
    'investor_permalink',
    'investor_region',
    'investor_category_code',
    'funded_month',
    'funded_quarter',
    'funded_year',
]

# Take the column names from the file header (nrows=0 parses no data rows)
# instead of from a `chunk` variable left over from an earlier loop. That
# keeps this cell re-runnable even after later cells rebind `chunk` to a
# frame that no longer contains the dropped columns.
all_cols = pd.read_csv('crunchbase-investments.csv', encoding='ISO-8859-1', nrows=0).columns
keep_cols = all_cols.drop(drop_cols)
keep_cols.to_list()


## Default Column Types

In [None]:
# Load the first 5000 rows of the retained columns with no dtype hints, to see
# which types pandas infers by default.
first_5000 = pd.read_csv(
    'crunchbase-investments.csv',
    usecols=keep_cols,
    nrows=5000,
    encoding='ISO-8859-1',
)
first_5000.dtypes

## Inspect string columns that are candidates for the category type

In [None]:
# For every object (string) column, count how many distinct values it holds
# across the whole file, working one 5000-row chunk at a time.
chunk_iter = pd.read_csv(
    'crunchbase-investments.csv',
    usecols=keep_cols,
    encoding='ISO-8859-1',
    chunksize=5000,
)
unique_dict = {}
total_row = 0

for chunk in chunk_iter:
    total_row += len(chunk)
    chunk_object_cols = chunk.select_dtypes(include=['object']).columns
    for col in chunk_object_cols:
        # Collect the per-chunk value counts; they are merged per column below.
        unique_dict.setdefault(col, []).append(chunk[col].value_counts())

print('column_name' + '\t\t\t' + 'unique_values' + '\t' + '% unique')
print('==================================================')
for col in unique_dict:
    # Merging the per-chunk counts on their index leaves one row per distinct
    # value, so the length of the result is the number of unique values.
    uniq_series = pd.concat(unique_dict[col])
    unique_sum = uniq_series.groupby(level=0).sum()
    unique_dict[col] = len(unique_sum)
    pct_unique = round(unique_dict[col] / total_row * 100, 2)
    print(f'{col}\t{unique_dict[col]}'.expandtabs(32) + '\t\t' + str(pct_unique))

## Convert Columns to category and date type.

- Convert the string columns that have a limited number of unique values to the `category` type:
  - `company_category_code`
  - `company_country_code` (only has 2 unique values)
  - `company_state_code`
  - `investor_country_code`
  - `investor_state_code`
  - `funding_round_type`

- Convert the `funded_at` column to a datetime type.

In [None]:
# String columns that are read as pandas categoricals.
category_cols = [
    "company_category_code",
    "company_country_code",
    "company_state_code",
    "investor_country_code",
    "investor_state_code",
    "funding_round_type",
]
col_types = dict.fromkeys(category_cols, "category")

# Chunked reader that applies the category dtypes and parses `funded_at`
# as a date while loading.
chunk_iter = pd.read_csv(
    'crunchbase-investments.csv',
    usecols=keep_cols,
    dtype=col_types,
    parse_dates=["funded_at"],
    encoding='ISO-8859-1',
    chunksize=5000,
)

## Inspect Columns

In [None]:
# Preview the first 7 rows, restricted to the retained columns.
pd.read_csv(
    'crunchbase-investments.csv',
    usecols=keep_cols,
    nrows=7,
    encoding='ISO-8859-1',
)

In [None]:
# Check whether `investor_region` carries the same information as
# `investor_city`. Only these two columns are requested: `investor_region` is
# listed in `drop_cols`, so it is not available through `usecols=keep_cols`.
crunch_iter = pd.read_csv(
    'crunchbase-investments.csv',
    chunksize=5000,
    encoding='ISO-8859-1',
    usecols=['investor_region', 'investor_city'],
)
is_same = []          # one boolean per chunk: did every row match?
mismatch_parts = []   # rows where the two columns disagree
for chunk in crunch_iter:
    region = chunk['investor_region']
    city = chunk['investor_city']
    # `region == city` is an element-wise comparison producing a boolean
    # Series; using it directly in an `if` raises ValueError. NaN never equals
    # NaN, so rows where both values are missing are counted as matching.
    same = (region == city) | (region.isna() & city.isna())
    is_same.append(bool(same.all()))
    mismatch_parts.append(chunk.loc[~same])

mismatches = pd.concat(mismatch_parts)
print('investor_region always equals investor_city:', all(is_same))
print('rows that differ:', len(mismatches))
mismatches.head(10)
    

    