In [1]:
import pandas as pd
import numpy as np
import sqlite3
import chardet
import math

In [2]:
#doesn't actually save any data in this dataset
# need to manually chunk and modify each chunk
def chunk_and_save(csv_name, mb_limit=5, encoding=None):
    if encoding is None:
        with open(csv_name, 'rb') as f:
            result = chardet.detect(f.read(5000))
        encoding = result['encoding']
        print(f'Found encoding {encoding}')
    
    #Select chunck size close to memory limit
    df = pd.read_csv(csv_name, nrows=1000, encoding=encoding)
    mem_1000 = df.memory_usage(deep=True).sum()/2**20
    chunk_size = math.floor(1000 * (mb_limit / mem_1000))
    
    # Initial column looks
    perc_cols = find_str_percentiles(df)
    
    
    # Use chunk size to parse
    chunk_iter = pd.read_csv(csv_name, encoding=encoding, chunksize=chunk_size)
    processed_chunks = []
    for chunk in chunk_iter:
        chunk[perc_cols] = chunk[perc_cols].apply(lambda L: L.str.rstrip('%').astype('float') / 100.0, axis=1)
        # Convert numeric columns to more efficient representations
        for col in chunk.select_dtypes(include=[np.number]).columns:
            col_min = chunk[col].min()
            col_max = chunk[col].max()
            if pd.api.types.is_float_dtype(chunk[col]):
                chunk[col] = pd.to_numeric(chunk[col], downcast='float')
            elif pd.api.types.is_integer_dtype(chunk[col]):
                chunk[col] = pd.to_numeric(chunk[col], downcast='integer')

        # Append the processed chunk to the list
        processed_chunks.append(chunk)

    # Concatenate all processed chunks into a single DataFrame
    processed_df = pd.concat(processed_chunks, ignore_index=True)
    
    return processed_df

def find_str_percentiles(df: pd.DataFrame) -> [str]:
    # Convert columns with '%' to numeric
    percentile_cols = []
    for col in df.columns:
        if df[col].dtype == 'object':  # Check if the column is of object type
            if df[col].str.endswith('%').any():
                percentile_cols.append(col)
    return percentile_cols


In [3]:
# df = chunk_and_save('crunchbase-investments.csv', 10, 'ISO-8859-1')
#Select chunck size close to memory limit
csv_name = 'crunchbase-investments.csv'
encoding = 'ISO-8859-1'  # had to look at solution to find this out
df_probe = pd.read_csv(csv_name, nrows=1000, encoding=encoding)
mem_1000 = df_probe.memory_usage(deep=True).sum()/2**20
chunk_size = math.floor(1000 * (10 / mem_1000))
chunk_iter = pd.read_csv(csv_name, encoding=encoding, chunksize=chunk_size)
# Total initial memory being used is 58.6 Mb

In [4]:
# Initial exploration of the data
display(df_probe.head(10))

Unnamed: 0,company_permalink,company_name,company_category_code,company_country_code,company_state_code,company_region,company_city,investor_permalink,investor_name,investor_category_code,investor_country_code,investor_state_code,investor_region,investor_city,funding_round_type,funded_at,funded_month,funded_quarter,funded_year,raised_amount_usd
0,/company/advercar,AdverCar,advertising,USA,CA,SF Bay,San Francisco,/company/1-800-flowers-com,1-800-FLOWERS.COM,,USA,NY,New York,New York,series-a,2012-10-30,2012-10,2012-Q4,2012,2000000.0
1,/company/launchgram,LaunchGram,news,USA,CA,SF Bay,Mountain View,/company/10xelerator,10Xelerator,finance,USA,OH,Columbus,Columbus,other,2012-01-23,2012-01,2012-Q1,2012,20000.0
2,/company/utap,uTaP,messaging,USA,,United States - Other,,/company/10xelerator,10Xelerator,finance,USA,OH,Columbus,Columbus,other,2012-01-01,2012-01,2012-Q1,2012,20000.0
3,/company/zoopshop,ZoopShop,software,USA,OH,Columbus,columbus,/company/10xelerator,10Xelerator,finance,USA,OH,Columbus,Columbus,angel,2012-02-15,2012-02,2012-Q1,2012,20000.0
4,/company/efuneral,eFuneral,web,USA,OH,Cleveland,Cleveland,/company/10xelerator,10Xelerator,finance,USA,OH,Columbus,Columbus,other,2011-09-08,2011-09,2011-Q3,2011,20000.0
5,/company/tackk,Tackk,web,USA,OH,Cleveland,Cleveland,/company/10xelerator,10Xelerator,finance,USA,OH,Columbus,Columbus,other,2012-02-01,2012-02,2012-Q1,2012,20000.0
6,/company/acclaimd,Acclaimd,analytics,USA,OH,Columbus,Columbus,/company/10xelerator,10Xelerator,finance,USA,OH,Columbus,Columbus,angel,2012-06-01,2012-06,2012-Q2,2012,20000.0
7,/company/acclaimd,Acclaimd,analytics,USA,OH,Columbus,Columbus,/company/10xelerator,10Xelerator,finance,USA,OH,Columbus,Columbus,angel,2012-08-07,2012-08,2012-Q3,2012,70000.0
8,/company/toviefor,ToVieFor,ecommerce,USA,NY,New York,New York,/company/2010-nyu-stern-business-plan-competition,2010 NYU Stern Business Plan Competition,,,,unknown,,angel,2010-04-01,2010-04,2010-Q2,2010,75000.0
9,/company/ohk-labs,OHK Labs,sports,USA,FL,Palm Beach,Boca Raton,/company/22hundred-group,22Hundred Group,,,,unknown,,angel,2011-09-01,2011-09,2011-Q3,2011,100000.0


Initial impressions:
 - Drop: "company_permalink" and "investor_permalink" columns for analysis
 - Convert->Category: any column with suffix "code"
 - Convert->Category: "company_region", "investor_region", "funding_round_type"
 - Convert->Datetime: "funded_at" column
 - Drop: Funded_month, funded_year
 - Convert->Category: funded_quarter

In [6]:
cols_to_drop = ['company_permalink', 'investor_permalink', 
                'funded_month', 'funded_year']
category_cols = ['company_category_code', 'company_country_code',
                'company_state_code', 'investor_category_code',
                'investor_country_code', 'investor_state_code',
                'company_region', 'investor_region',
                 'funding_round_type', 'funded_quarter']
to_combine = []
for chunk in chunk_iter:
    chunk.drop(columns=cols_to_drop)
    chunk[category_cols] = chunk[category_cols].astype('category')
    pd.to_datetime(chunk['funded_at'])
    to_combine.append(chunk)
df = pd.concat(to_combine)

In [11]:
df.info(memory_usage='deep')

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 43873 entries, 8997 to 52869
Data columns (total 20 columns):
 #   Column                  Non-Null Count  Dtype   
---  ------                  --------------  -----   
 0   company_permalink       43872 non-null  object  
 1   company_name            43872 non-null  object  
 2   company_category_code   43320 non-null  object  
 3   company_country_code    43872 non-null  object  
 4   company_state_code      43469 non-null  object  
 5   company_region          43872 non-null  object  
 6   company_city            43441 non-null  object  
 7   investor_permalink      43871 non-null  object  
 8   investor_name           43871 non-null  object  
 9   investor_category_code  0 non-null      category
 10  investor_country_code   32830 non-null  object  
 11  investor_state_code     28982 non-null  object  
 12  investor_region         43871 non-null  object  
 13  investor_city           32515 non-null  object  
 14  funding_round_type 

FROM DATAQUEST: 
Use the pandas SQLite workflow to answer the following questions:

What proportion of the total amount of funds did the top 10% raise? What about the top 1%? Compare these values to the proportions the bottom 10% and bottom 1% raised.
Which category of company attracted the most investments?
Which investor contributed the most money (across all startups)?
Which investors contributed the most money per startup?
Which funding round was the most popular? Which was the least popular?
Here are some ideas for further exploration:

Repeat the tasks in this guided project using stricter memory constraints (under 1 megabyte).
Clean and analyze the other Crunchbase datasets from the same GitHub repo.
Understand which columns the datasets share, and how the datasets are linked.
Create a relational database design that links the datasets together and reduces the overall disk space the database file consumes.
Use pandas to populate each table in the database, create the appropriate indexes, and so on.