### Analyze the Crunchbase data set while keeping a memory profile of below 10MB of memory


In [21]:
import pandas as pd
import numpy as np

c_iter = pd.read_csv("crunchbase-investments.csv", chunksize = 5000, encoding='ISO-8859-1')

### Find each column's missing value count:

In [33]:
c_iter = pd.read_csv("crunchbase-investments.csv", chunksize = 5000, encoding="ISO-8859-1")
null_list = []

for c in c_iter:
    null_list.append(c.isnull().sum())

joined_list = pd.concat(null_list)
grouped_list = joined_list.groupby(joined_list.index).sum()
grouped_list.sort_values()


company_country_code          1
company_name                  1
company_permalink             1
company_region                1
investor_region               2
investor_permalink            2
investor_name                 2
funded_quarter                3
funded_at                     3
funded_month                  3
funded_year                   3
funding_round_type            3
company_state_code          492
company_city                533
company_category_code       643
raised_amount_usd          3599
investor_country_code     12001
investor_city             12480
investor_state_code       16809
investor_category_code    50427
dtype: int64

### Find each column's memory footprint:

In [37]:
c_iter = pd.read_csv("crunchbase-investments.csv", chunksize = 5000, encoding="ISO-8859-1")
mf_list = []

for c in c_iter:
    mf_list.append(c.memory_usage(deep=True) / 1024 ** 2 )
joined_list = pd.concat(mf_list)
grouped_list = joined_list.groupby(joined_list.index).sum()
grouped_list.sort_values()

Index                     0.000877
funded_year               0.403366
raised_amount_usd         0.403366
investor_category_code    0.593590
investor_state_code       2.361876
investor_country_code     2.524654
investor_city             2.751430
company_state_code        2.962161
company_country_code      3.025223
funded_quarter            3.226837
funded_month              3.226837
investor_region           3.238946
funding_round_type        3.252704
company_region            3.253541
company_category_code     3.262619
company_city              3.343512
funded_at                 3.378091
company_name              3.424955
investor_name             3.734270
company_permalink         3.869808
investor_permalink        4.749821
dtype: float64

### Find total memory footprint of all chunks combined:

In [26]:
c_iter = pd.read_csv("crunchbase-investments.csv", chunksize=5000, encoding="ISO-8859-1")
total_memory = 0
for c in c_iter:
    total_memory += c.memory_usage(deep=True).sum() / 1024 ** 2
total_memory

56.988484382629395

### Find total rows in data set:

In [38]:
c_iter = pd.read_csv("crunchbase-investments.csv", chunksize=5000, encoding="ISO-8859-1")
rowcount = 0
for c in c_iter:
    rowcount += c.size
rowcount

1057400

### Which columns can be dropped:
#### The investor_permalink and company_permalink columns aren't really necessary for analysis and they take up a relatively large amount of memory. I would drop those 2 columns to start. Next would be the investor_category_code as it is missing a large amount of data.


In [42]:
drop_cols = ["investor_permalink", "company_permalink", "investor_category_code"]
keep_cols = c.columns.drop(drop_cols)
keep_cols.tolist
print(keep_cols)
c_iter = pd.read_csv("crunchbase-investments.csv", chunksize=5000, encoding="ISO-8859-1", usecols=keep_cols)

Index(['company_name', 'company_category_code', 'company_country_code',
       'company_state_code', 'company_region', 'company_city', 'investor_name',
       'investor_country_code', 'investor_state_code', 'investor_region',
       'investor_city', 'funding_round_type', 'funded_at', 'funded_month',
       'funded_quarter', 'funded_year', 'raised_amount_usd'],
      dtype='object')


### Identify the types of each column:

In [43]:
c.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2870 entries, 50000 to 52869
Data columns (total 20 columns):
company_permalink         2870 non-null object
company_name              2870 non-null object
company_category_code     2860 non-null object
company_country_code      2870 non-null object
company_state_code        2841 non-null object
company_region            2870 non-null object
company_city              2825 non-null object
investor_permalink        2870 non-null object
investor_name             2870 non-null object
investor_category_code    0 non-null float64
investor_country_code     0 non-null float64
investor_state_code       0 non-null float64
investor_region           2870 non-null object
investor_city             0 non-null float64
funding_round_type        2870 non-null object
funded_at                 2870 non-null object
funded_month              2870 non-null object
funded_quarter            2870 non-null object
funded_year               2870 non-null int64
rais