# Clean Raw Data #
This notebook cleans the raw data to prepare it for modeling.

In [None]:
import pickle
import pandas as pd
import numpy as np

### Import raw data from SQL join ###

In [None]:
# Load the raw joined table that was pickled by an earlier step.
# The context manager closes the file handle as soon as loading finishes.
# NOTE: unpickling can execute arbitrary code -- only load files this project
# produced itself.
with open('../pickle_jar/raw_join.p', 'rb') as raw_file:
    raw_join = pickle.load(raw_file)

### Drop unwanted columns ###

In [None]:
# Columns that are not wanted downstream (some are duplicates of others).
cols_to_drop = ['index','url','dc_slug','thumbnail_url','upload_date','canonical_committeeid',
               'slug','from_pactrack','freed_count','pactrack','state_id','date_coverage_from','end_date',
               'fec_uri','interest_group','date_coverage_to','next_filing_date','name','party','committeeid',
               'id','gross_amount','super_pac']

# Drop every listed column in a single call rather than one in-place drop per
# column. A column missing from the frame still raises KeyError.
raw_join.drop(cols_to_drop, axis=1, inplace=True)

### Drop committee types represented by fewer than 10% of filings, so the modeling task does not turn into anomaly detection ###

In [None]:
committee_types_to_drop = ['Q','N','V','W']

# Find every row whose committee type is in the drop list with one vectorized
# membership test, then remove those rows in a single in-place drop. This
# replaces a per-row lookup plus per-row drop, which rebuilt the frame once for
# every removed row.
has_dropped_type = raw_join['committee_type'].isin(committee_types_to_drop)
raw_join.drop(raw_join.loc[has_dropped_type].index, axis=0, inplace=True)

### Convert monetary amounts to floats ###

In [None]:
monetary_cols = ['total_individual_unitemized','total_disbursements',
                'total_candidate_contributions','total_pac_refunds','total_from_individuals','total_individual_refunds',
                'total_from_pacs','debts_owed','total_reciepts','total_contributions','transfers_in',
                'total_independent_expenditures','ad_gross_amount','committee_gross_amount']


def _parse_money(value):
    """Convert a currency string such as '$1,234.50' to a float.

    Values that are not strings (numbers, None, NaN) are returned unchanged.
    The dollar sign is removed wherever it appears, so '-$1,000' and ' $5'
    parse as well as '$5'.

    Raises:
        ValueError: if the text is not a valid number once '$' and ','
            have been removed (for example an empty string).
    """
    # isinstance also accepts str subclasses, which an exact type comparison
    # would skip and leave unconverted.
    if not isinstance(value, str):
        return value
    return float(value.replace('$', '').replace(',', ''))


for col in monetary_cols:
    raw_join[col] = raw_join[col].apply(_parse_money)

In [None]:
# Save the pared-down frame as a checkpoint. The context manager flushes and
# closes the file, so the pickle is completely written before moving on.
with open('../pickle_jar/raw_join_pared.p', 'wb') as pared_file:
    pickle.dump(raw_join, pared_file)

### Remove multiple filings for same contract ###
Several contracts have multiple filings listed--apparently one for each ad in the contract, as some contracts include terms for multiple ads. The 'ad_gross_amount' field represents the total sum of the subsequent filings, so for a group of filings under the same contract I kept the one with the largest 'ad_gross_amount', which should represent the total amount for the contract.

In [None]:
# Map each contract number to the largest 'ad_gross_amount' among its filings.
# The groupby max ignores missing (NaN) amounts, whereas the builtin max()
# returns NaN or a real value depending on where the NaN sits in the group.
max_amounts = raw_join.groupby('contract_number')['ad_gross_amount'].max().to_dict()

In [None]:
# For every contract, record the index label of the first filing whose
# 'ad_gross_amount' equals that contract's maximum amount.
keep_indicies = []
for contract, top_amount in max_amounts.items():
    same_contract = raw_join['contract_number'] == contract
    at_top_amount = raw_join['ad_gross_amount'] == top_amount
    matching_rows = raw_join[same_contract & at_top_amount]
    # Several filings can share the maximum; only the first one is kept.
    keep_indicies.append(matching_rows.index[0])

### Pickle cleaned dataframe ###

In [None]:
# Keep only the selected filing for each contract. The deep copy makes
# clean_join independent of raw_join.
clean_join = raw_join.loc[keep_indicies, :].copy(deep=True)

# The context manager flushes and closes the file so the pickle is fully
# written to disk.
with open('../pickle_jar/clean_join.p', 'wb') as clean_file:
    pickle.dump(clean_join, clean_file)