# Create Single DataFrame with all County Data

In [88]:
import glob
import os
import re

import pandas as pd

In [89]:
def grab_county_file_paths(directory="countyCSV"):
    """Return the paths of every entry directly inside ``directory``.

    Args:
        directory: Folder to scan. Defaults to ``"countyCSV"``, the folder
            this notebook has always read from, so existing zero-argument
            calls behave exactly as before.

    Returns:
        A list of path strings as produced by ``glob.glob`` for the pattern
        ``"<directory>/*"``. The list is empty when nothing matches. The
        order is whatever ``glob`` yields; it is not sorted here.
    """
    # glob.glob already returns a fresh list, so no manual copy loop is
    # needed. The parameter is not called ``dir`` to avoid shadowing the
    # builtin of that name.
    return glob.glob(f"{directory}/*")


In [90]:
# Collect the path of every entry found in the county CSV folder.
county_file_paths = grab_county_file_paths()

In [109]:
def process_csvs(paths):
    """Load county CSVs and separate the usable ones from the rest.

    A file is usable when, after optional header promotion, it has exactly
    two columns, the first column's name contains ``"Virginia"`` and the
    second column's name does not. Usable frames get their first column
    renamed to ``"code_va"`` (and set as the index) and their second column
    renamed to the county name taken from the file name (extension removed,
    spaces replaced by underscores).

    Args:
        paths: Iterable of CSV file paths.

    Returns:
        A ``(valid_dfs, invalid_dfs)`` tuple. ``valid_dfs`` is a list of
        single-column DataFrames indexed by ``code_va``. ``invalid_dfs`` is
        a list of the *paths* (not frames) that were rejected.
    """
    valid_dfs = []
    invalid_dfs = []

    for path in paths:
        try:
            df = pd.read_csv(path)
        except pd.errors.EmptyDataError:
            # A zero-byte file has no columns to inspect; record it as
            # unusable instead of aborting the whole batch.
            invalid_dfs.append(path)
            continue

        columns = df.columns
        # Some files carry positional column labels "0"/"1", with the real
        # header stored in the first data row. Promote that row to the
        # header. The emptiness guard avoids an IndexError on df.iloc[0]
        # when such a file has no data rows at all.
        if ("0" in columns or "1" in columns) and not df.empty:
            header = df.iloc[0]
            df = df.rename(header, axis="columns")
            df = df.iloc[1:]
            columns = df.columns

        # Promoted header cells may be non-strings (e.g. NaN), and
        # `"Virginia" in <float>` would raise TypeError, so require strings
        # before doing the substring checks.
        usable = (
            len(columns) == 2
            and isinstance(columns[0], str)
            and isinstance(columns[1], str)
            and "Virginia" in columns[0]
            and "Virginia" not in columns[1]
        )
        if not usable:
            invalid_dfs.append(path)
            continue

        # Derive the county from the file name itself so the result does
        # not depend on the directory's name or length, on a path prefix,
        # or on the path separator in use.
        county = os.path.splitext(os.path.basename(path))[0]
        county = county.replace(" ", "_")

        df = df.rename({columns[0]: "code_va", columns[1]: county}, axis=1)
        valid_dfs.append(df.set_index("code_va"))

    return valid_dfs, invalid_dfs
    

In [110]:
# Parse every collected CSV. Files that do not pass the two-column "Virginia"
# header check come back as paths in invalid_dfs.
valid_dfs, invalid_dfs = process_csvs(county_file_paths)

In [111]:
len(valid_dfs)

120

In [116]:
# Spot-check the first three parsed frames: each should have a code_va index
# and a single column named after its county.
for df in valid_dfs[:3]:
    print(df.head())

              falls_church
code_va                   
1-1                    1-2
1-21                   1-2
1-200 et seq.          1-2
1-210                  1-2
1-216                  1-2
        Richmond
code_va         
1-210        1-2
1-216        1-2
1-217        1-8
1-222        1-2
1-223        1-2
            prince_george_county
code_va                         
1-1 et seq.                  1-2
1-13.9                       1-3
1-13.32                      1-2
1-17.1                       1-7
2.2-614.1                  2-731


## Concat Implementation

In [112]:
# Concatenate every county frame. No axis is passed, so the frames are stacked
# row-wise: as the output shows, each row carries a value only in the column
# of the county it came from, and every other county column is NaN.
# NOTE(review): if one row per code_va is the goal, this is presumably not the
# desired shape - confirm before relying on it.
big_df = pd.concat(valid_dfs, join="outer")
big_df

Unnamed: 0_level_0,falls_church,Richmond,prince_george_county,quantico,fairfax,madison_county,strasburg_2,covington,hamilton_2,bluefield,...,south_hill,west_point,new_kent_county,halifax_county_2,greene_county,new_market,washington_county,manassas,colonial_beach,haymarket_2
code_va,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1,Unnamed: 9_level_1,Unnamed: 10_level_1,Unnamed: 11_level_1,Unnamed: 12_level_1,Unnamed: 13_level_1,Unnamed: 14_level_1,Unnamed: 15_level_1,Unnamed: 16_level_1,Unnamed: 17_level_1,Unnamed: 18_level_1,Unnamed: 19_level_1,Unnamed: 20_level_1,Unnamed: 21_level_1
1-1,1-2,,,,,,,,,,...,,,,,,,,,,
1-21,1-2,,,,,,,,,,...,,,,,,,,,,
1-200 et seq.,1-2,,,,,,,,,,...,,,,,,,,,,
1-210,1-2,,,,,,,,,,...,,,,,,,,,,
1-216,1-2,,,,,,,,,,...,,,,,,,,,,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
58.1-3993,,,,,,,,,,,...,,,,,,,,,,14-49
59.1-142 et seq.,,,,,,,,,,,...,,,,,,,,,,30-4
59.1-296,,,,,,,,,,,...,,,,,,,,,,58-1
62.1-44.2 et seq.,,,,,,,,,,,...,,,,,,,,,,Ch. 50


## Merge Implementation

In [46]:
def merge_dfs(single_df, df):
    """Join two county frames on their shared ``code_va`` key.

    Args:
        single_df: Left-hand frame, typically the result accumulated so far.
        df: Right-hand frame to join onto it.

    Returns:
        The merged DataFrame. No ``how`` is passed, so pandas' default join
        type applies.
    """
    combined = pd.merge(left=single_df, right=df, on="code_va")
    return combined

In [47]:
# Trial run: merge only the first two county frames on the shared code_va key.
# No `how` is passed, so pandas' default join type applies; the result below
# has 185 rows.
big_df = pd.merge(valid_dfs[0], valid_dfs[1], on="code_va")
big_df

Unnamed: 0,code_va,falls_church,Richmond
0,1-210,1-2,1-2
1,1-216,1-2,1-2
2,1-217,1-5,1-8
3,1-223,1-2,1-2
4,1-227,1-2,1-2
...,...,...,...
181,62.1-44.15:24 et seq.,"35-1, 35-2",14-149
182,62.1-44.15:51 et seq.,"35-1, 35-2","Ch. 14, Art. III"
183,62.1-44.15:52,35-14,14-149
184,62.1-44.15:67 et seq.,Ch. 35,"Ch. 14, Art. IV"


In [48]:
# Add a third county the same way. The result shrinks from 185 rows to 61
# (see output), so each extra default-`how` merge discards more codes.
pd.merge(big_df, valid_dfs[2], on="code_va")

Unnamed: 0,code_va,falls_church,Richmond,prince_george_county
0,3.2-6503,4-75,4-241,6-149
1,3.2-6538,"4-40, 4-41",4-243,6-146
2,9.1-101,28-202,3-73,6-3
3,10.1-1400 et seq.,Ch. 34,Ch. 23 (note),Ch. 66
4,15.2-901,33-10,"Ch. 11, Art. IV",54-27
...,...,...,...,...
57,58.1-3800 et seq.,Ch. 40,"Ch. 26, Art. XI","Ch. 74, Art. IV"
58,58.1-3814,40-296—40-299,"Ch. 26, Art. VII",74-153
59,58.1-3916,40-283,26-331,14-33
60,58.1-3986,40-459,26-870,14-34


In [43]:
# Fold every county frame into one wide frame keyed on code_va.
# Every step uses the same outer join on "code_va", so no code is dropped.
# Previously the first pair was merged with the default `how` (which discarded
# codes missing from either frame), and the later merges named no key at all.
# NOTE(review): if code_va values repeat within a frame, each outer merge
# multiplies the matching rows, which may be why this cell ran long enough to
# be interrupted - consider de-duplicating the keys first.
big_df = valid_dfs[0]
for i, df in enumerate(valid_dfs[1:]):
    big_df = pd.merge(big_df, df, on="code_va", how="outer")
    print(f"processed {i} dataframes...")
    


processed 0 dataframes...
processed 1 dataframes...
processed 2 dataframes...
processed 3 dataframes...
processed 4 dataframes...


KeyboardInterrupt: 

In [None]:
big_df