### Import Dependencies


In [None]:
import pandas as pd
import numpy as np
import json
import sqlalchemy
from sqlalchemy.ext.automap import automap_base
from sqlalchemy.orm import Session
from sqlalchemy import create_engine, func, inspect, desc
import sqlite3
import requests
import lxml.html as lh
from lxml.etree import tostring

### Connect to Database

In [None]:
# Location of the SQLite database queried by the cells below.
DATABASE_URL = "sqlite:///Election2020Data_ew.sqlite"

# Build the SQLAlchemy engine, then open the connection handed to pandas.
engine = create_engine(DATABASE_URL)
conn = engine.connect()

In [None]:
# master_data = pd.read_sql_table('2020Donors', conn)
# master_data.head()

### Divide Data Into Chunks

In [None]:
# Only these columns are read; "Record_id" becomes the index of every chunk.
donor_columns = [
    "Flag_Orgind", "First_Name", "City", "State", "Zip", "Occupation",
    "Amount", "Date", "Aggregate_Amount", "Cycle", "Campaign",
]

# With chunksize set, read_sql_table returns an iterator of DataFrames;
# list() drains it so every chunk is kept for the cells below.
dataFrames = list(
    pd.read_sql_table(
        "2020Donors",
        conn,
        index_col="Record_id",
        columns=donor_columns,
        chunksize=1000000,
    )
)

### Display Single Chunk

In [None]:
# Show the first chunk as a sanity check on the load.
dataFrames[0]

Unnamed: 0_level_0,Flag_Orgind,First_Name,City,State,Zip,Occupation,Amount,Date,Aggregate_Amount,Cycle,Campaign
Record_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,IND,Russell,Birmingham,AL,35205,Retired,250.0,2019-03-31,250.00,2020,Kamala Harris For The People
2,IND,Cody,Mobile,AL,36695,Clerk,100.0,2019-03-31,250.00,2020,Kamala Harris For The People
3,IND,Frank,Mountain Brk,AL,35213,Retired,500.0,2019-03-31,500.00,2020,Kamala Harris For The People
4,IND,Henry,Birmingham,AL,35242,PT Consultant,25.0,2019-03-30,297.84,2020,Kamala Harris For The People
5,IND,Henry,Birmingham,AL,35242,PT Consultant,25.0,2019-03-28,297.84,2020,Kamala Harris For The People
...,...,...,...,...,...,...,...,...,...,...,...
999996,IND,Catherine,Pasadena,CA,91104,Teacher,100.0,2019-11-04,240.05,2020,"Warren For President, Inc."
999997,IND,Laura,San Francisco,CA,94158,Product Manager,100.0,2019-10-15,300.00,2020,"Warren For President, Inc."
999998,IND,Laura,San Francisco,CA,94158,Product Manager,100.0,2019-12-31,300.00,2020,"Warren For President, Inc."
999999,IND,Marie,Sacramento,CA,95831,Education Administration,50.0,2019-10-27,460.00,2020,"Warren For President, Inc."


### Display Datatypes

In [None]:
# Column dtypes of the first chunk.
dataFrames[0].dtypes

Flag_Orgind                 object
First_Name                  object
City                        object
State                       object
Zip                         object
Occupation                  object
Amount                     float64
Date                datetime64[ns]
Aggregate_Amount           float64
Cycle                        int64
Campaign                    object
dtype: object

### Clean Occupation Data and Display Single Chunk

In [None]:
# Literal substring -> replacement, applied in this order after lowercasing.
# The former 'None' -> 'none' step is dropped: once the column has been
# lowercased no capital "N" can remain, so that replacement never matched.
occupation_replacements = {
    "notemployed": "unemployed",
    "un employed": "unemployed",
    "self employed": "owner",
    "selfemployed": "owner",
    "not employed": "unemployed",
}

# Normalise the "Occupation" column of every chunk in place.
for dataframe in dataFrames:
    occupation = dataframe["Occupation"].str.lower()
    for old_text, new_text in occupation_replacements.items():
        # regex=False pins literal matching; the default for this argument
        # is not the same in every pandas version.
        occupation = occupation.str.replace(old_text, new_text, regex=False)
    dataframe["Occupation"] = occupation
dataFrames[0]

Unnamed: 0_level_0,Flag_Orgind,First_Name,City,State,Zip,Occupation,Amount,Date,Aggregate_Amount,Cycle,Campaign
Record_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
1,IND,Russell,Birmingham,AL,35205,retired,250.0,2019-03-31,250.00,2020,Kamala Harris For The People
2,IND,Cody,Mobile,AL,36695,clerk,100.0,2019-03-31,250.00,2020,Kamala Harris For The People
3,IND,Frank,Mountain Brk,AL,35213,retired,500.0,2019-03-31,500.00,2020,Kamala Harris For The People
4,IND,Henry,Birmingham,AL,35242,pt consultant,25.0,2019-03-30,297.84,2020,Kamala Harris For The People
5,IND,Henry,Birmingham,AL,35242,pt consultant,25.0,2019-03-28,297.84,2020,Kamala Harris For The People
...,...,...,...,...,...,...,...,...,...,...,...
999996,IND,Catherine,Pasadena,CA,91104,teacher,100.0,2019-11-04,240.05,2020,"Warren For President, Inc."
999997,IND,Laura,San Francisco,CA,94158,product manager,100.0,2019-10-15,300.00,2020,"Warren For President, Inc."
999998,IND,Laura,San Francisco,CA,94158,product manager,100.0,2019-12-31,300.00,2020,"Warren For President, Inc."
999999,IND,Marie,Sacramento,CA,95831,education administration,50.0,2019-10-27,460.00,2020,"Warren For President, Inc."


### Display Unique Values in "Flag_Orgind" Column

In [None]:
# Distinct donor-type flags present in the first chunk.
dataFrames[0]["Flag_Orgind"].unique()

array(['IND', 'ORG', 'CCM', 'PAC', 'COM', 'PTY', 'CAN'], dtype=object)

### Investigate Overall Number of Non-Individual Donors 

In [None]:
# Collect, chunk by chunk, every row whose flag is anything other than "IND",
# then stack the pieces into one frame for inspection.
non_ind_list = [
    dataframe[dataframe["Flag_Orgind"].ne("IND")] for dataframe in dataFrames
]
non_ind_final = pd.concat(non_ind_list)
non_ind_final
    

Unnamed: 0_level_0,Flag_Orgind,First_Name,City,State,Zip,Occupation,Amount,Date,Aggregate_Amount,Cycle,Campaign
Record_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1
133,ORG,,BIRMINGHAM,AL,35203,,500.00,2019-03-25,500.00,2020,"Donald J. Trump For President, Inc."
2857,CCM,,Huntsville,AL,35801,,100.00,2019-06-23,200.00,2020,Swalwell For America
2858,CCM,,Huntsville,AL,35801,,100.00,2019-05-23,200.00,2020,Swalwell For America
16399,PAC,,Birmingham,AL,35213,,5000.00,2019-12-31,5000.00,2020,Biden For President
25750,COM,,Birmingham,AL,35203,,250.00,2020-01-23,250.00,2020,Biden For President
...,...,...,...,...,...,...,...,...,...,...,...
9758034,ORG,,Cheyenne,WY,82001,,16820.70,2020-04-03,16820.70,2020,Tom Steyer 2020
9761061,CCM,,CASPER,WY,82605,,2000.00,2020-06-10,2000.00,2020,"Donald J. Trump For President, Inc."
9761062,CCM,,CASPER,WY,82605,,2000.00,2020-06-10,4000.00,2020,"Donald J. Trump For President, Inc."
9762333,PTY,,CHEYENNE,WY,82001,,972.10,2020-07-24,972.10,2020,"Donald J. Trump For President, Inc."


### Display Unique Values in "Campaign" Column

In [None]:
# Distinct campaign names present in the first chunk.
dataFrames[0]["Campaign"].unique()

array(['Kamala Harris For The People', 'Bernie 2020', 'Tulsi Now',
       'Donald J. Trump For President, Inc.', 'Pete For America, Inc.',
       'Beto For America', 'Friends Of Andrew Yang', 'Amy For America',
       'Warren For President, Inc.', 'Cory 2020',
       'Marianne Williamson For President', 'Hickenlooper 2020',
       'Bennet For America', 'Seth Moulton For America, Inc.',
       'Swalwell For America', 'Biden For President',
       'Tim Ryan For America',
       'Weld 2020 Presidential Campaign Committee, Inc.',
       'De Blasio 2020', 'Friends Of John Delaney',
       'Bullock For President', 'Tom Steyer 2020',
       'Julian For The Future', 'Deval For All',
       'Mike Bloomberg 2020, Inc.', 'Inslee For America',
       'Joe Sestak For President', 'Wayne Messam For America, Inc.',
       'Gillibrand 2020', 'People First Future',
       'Committee For Peace, Justice, And Mike Gravel'], dtype=object)

### Filter "Flag_Orgind" Column for Individual Donors, Append to New List, and Check Unique Values

In [None]:
# Keep only individual ("IND") donors from each chunk.
# The explicit .copy() makes each filtered frame an independent object, so
# adding columns to it later does not raise pandas' SettingWithCopyWarning.
cleaned_dataframes = [
    dataframe[dataframe["Flag_Orgind"] == "IND"].copy()
    for dataframe in dataFrames
]
# Confirm that only the "IND" flag is left in the first filtered chunk.
cleaned_dataframes[0].Flag_Orgind.unique()

array(['IND'], dtype=object)

### Create Lists for Campaign Type

In [None]:
# Campaign committee names grouped by party; used to derive "Campaign_Type".
democratic_campaigns = [
    'Kamala Harris For The People',
    'Bernie 2020',
    'Tulsi Now',
    'Pete For America, Inc.',
    'Beto For America',
    'Friends Of Andrew Yang',
    'Amy For America',
    'Warren For President, Inc.',
    'Cory 2020',
    'Marianne Williamson For President',
    'Hickenlooper 2020',
    'Bennet For America',
    'Seth Moulton For America, Inc.',
    'Swalwell For America',
    'Biden For President',
    'Tim Ryan For America',
    'De Blasio 2020',
    'Friends Of John Delaney',
    'Bullock For President',
    'Tom Steyer 2020',
    'Julian For The Future',
    'Deval For All',
    'Mike Bloomberg 2020, Inc.',
    'Inslee For America',
    'Joe Sestak For President',
    'Wayne Messam For America, Inc.',
    'Gillibrand 2020',
    'People First Future',
    'Committee For Peace, Justice, And Mike Gravel',
]
republican_campaigns = [
    'Donald J. Trump For President, Inc.',
    'Weld 2020 Presidential Campaign Committee, Inc.',
]

### Create New Column for Campaign Type and Assign Values

In [None]:
# Label every donation with the party of the receiving campaign.
# Both party lists are checked explicitly; a campaign found in neither list
# (or a missing campaign name) is labelled "Unknown" rather than being
# counted as Republican by default.
# assign() returns a new frame for each chunk, so no column is written into
# a slice of another DataFrame.
cleaned_dataframes = [
    dataframe.assign(
        Campaign_Type=np.select(
            [
                dataframe["Campaign"].isin(democratic_campaigns),
                dataframe["Campaign"].isin(republican_campaigns),
            ],
            ["Democratic", "Republican"],
            default="Unknown",
        )
    )
    for dataframe in cleaned_dataframes
]
cleaned_dataframes[0].Campaign_Type.unique()


A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  


array(['Democratic', 'Republican'], dtype=object)

### Verify New Values

In [None]:
# Rows of the first cleaned chunk labelled "Democratic"; list their campaigns
# to check the label against the campaign names.
democratic_test_0 = cleaned_dataframes[0][
    cleaned_dataframes[0]["Campaign_Type"].eq("Democratic")
]
democratic_test_0.Campaign.unique()

array(['Kamala Harris For The People', 'Bernie 2020', 'Tulsi Now',
       'Pete For America, Inc.', 'Beto For America',
       'Friends Of Andrew Yang', 'Amy For America',
       'Warren For President, Inc.', 'Cory 2020',
       'Marianne Williamson For President', 'Hickenlooper 2020',
       'Bennet For America', 'Seth Moulton For America, Inc.',
       'Swalwell For America', 'Biden For President',
       'Tim Ryan For America', 'De Blasio 2020',
       'Friends Of John Delaney', 'Bullock For President',
       'Tom Steyer 2020', 'Julian For The Future', 'Deval For All',
       'Mike Bloomberg 2020, Inc.', 'Inslee For America',
       'Joe Sestak For President', 'Wayne Messam For America, Inc.',
       'Gillibrand 2020', 'People First Future',
       'Committee For Peace, Justice, And Mike Gravel'], dtype=object)

In [None]:
# Rows of the first cleaned chunk labelled "Republican"; list their campaigns
# to check the label against the campaign names.
republican_test_0 = cleaned_dataframes[0][
    cleaned_dataframes[0]["Campaign_Type"].eq("Republican")
]
republican_test_0.Campaign.unique()

array(['Donald J. Trump For President, Inc.',
       'Weld 2020 Presidential Campaign Committee, Inc.'], dtype=object)

### Scrape Gender Data and Create Gender Lists

In [None]:
# male_url = "https://namecensus.com/male_names_alpha.htm"
# male_page = requests.get(male_url)
# male_doc = lh.fromstring(male_page.content)
# male_inner_html = tostring(male_doc)
# male_tr_elements = male_doc.xpath("//tr")

# female_url = "https://namecensus.com/female_names_alpha.htm"
# female_page = requests.get(female_url)
# female_doc = lh.fromstring(female_page.content)
# female_inner_html = tostring(female_doc)
# female_tr_elements = female_doc.xpath("//tr")

# male_table_text = []
# for i in range(1, len(male_tr_elements)):
#     for male_element in male_tr_elements[i]:
#         male_cell_text = male_element.text_content()
#         male_table_text.append(male_cell_text)

# male_name_list = male_table_text[::4]
# male_name_strings = [str(item) for item in male_name_list]

# female_table_text = []
# for j in range(1, len(female_tr_elements)):
#     for female_element in female_tr_elements[j]:
#         female_cell_text = female_element.text_content()
#         female_table_text.append(female_cell_text)

# female_name_list = female_table_text[::4]
# female_name_strings = [str(item) for item in female_name_list]

### Concatenate Cleaned Dataframes

In [None]:
# Stack the per-chunk frames row-wise into one DataFrame.
cleaned_dataframe = pd.concat(cleaned_dataframes, axis=0)
# Preview the first five rows.
cleaned_dataframe.head(n=5)

Unnamed: 0_level_0,Flag_Orgind,First_Name,City,State,Zip,Occupation,Amount,Date,Aggregate_Amount,Cycle,Campaign,Campaign_Type
Record_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1
1,IND,Russell,Birmingham,AL,35205,retired,250.0,2019-03-31,250.0,2020,Kamala Harris For The People,Democratic
2,IND,Cody,Mobile,AL,36695,clerk,100.0,2019-03-31,250.0,2020,Kamala Harris For The People,Democratic
3,IND,Frank,Mountain Brk,AL,35213,retired,500.0,2019-03-31,500.0,2020,Kamala Harris For The People,Democratic
4,IND,Henry,Birmingham,AL,35242,pt consultant,25.0,2019-03-30,297.84,2020,Kamala Harris For The People,Democratic
5,IND,Henry,Birmingham,AL,35242,pt consultant,25.0,2019-03-28,297.84,2020,Kamala Harris For The People,Democratic


### Display Final Dataframe Length and Data Types

In [None]:
# Number of rows in the combined frame.
cleaned_dataframe.shape[0]

9749263

In [None]:
# Column dtypes of the combined frame, including the added "Campaign_Type".
cleaned_dataframe.dtypes

Flag_Orgind                 object
First_Name                  object
City                        object
State                       object
Zip                         object
Occupation                  object
Amount                     float64
Date                datetime64[ns]
Aggregate_Amount           float64
Cycle                        int64
Campaign                    object
Campaign_Type               object
dtype: object

### Confirm Filtering of Donor Type

In [None]:
# Distinct donor-type flags left in the combined frame.
cleaned_dataframe["Flag_Orgind"].unique()

array(['IND'], dtype=object)

In [None]:
# Select any rows whose flag is not "IND"; an empty result confirms the filter.
cleaned_dataframe[cleaned_dataframe["Flag_Orgind"].ne("IND")]

Unnamed: 0_level_0,Flag_Orgind,First_Name,City,State,Zip,Occupation,Amount,Date,Aggregate_Amount,Cycle,Campaign,Campaign_Type
Record_id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1


### Confirm Addition of Campaign Types

In [None]:
# Distinct values of the derived "Campaign_Type" column in the combined frame.
cleaned_dataframe["Campaign_Type"].unique()

array(['Democratic', 'Republican'], dtype=object)

### Extract Sample From Final Dataframe and Export to CSV

In [None]:
# Draw a 15% random sample of the combined frame and write it to CSV.
# random_state is fixed so that re-running the notebook selects the same rows
# and regenerates the same file instead of a different sample each time.
new_sample_df = cleaned_dataframe.sample(frac=0.15, random_state=42)
new_sample_df.to_csv("new_sample_df.csv")