# Create Single DataFrame with all County Data

In [93]:
import pandas as pd
import glob
import re

In [94]:
def grab_county_file_paths():
    paths = []
    dir = "countyCSV"
    for path in glob.glob(f"{dir}/*"):
        paths.append(path)
    return paths


In [95]:
county_file_paths = grab_county_file_paths()

In [96]:
#county_file_paths

In [97]:
def process_csvs(paths):
    FILE_NAME_PARSE = re.compile(r"(\w{9})/(.*).csv")
    
    valid_dfs = []
    invalid_dfs = []
    
    for path in paths:
        df = pd.read_csv(path)
        columns = df.columns
        
        if "0" in columns or "1" in columns:
            header = df.iloc[0]
            df = df.rename(header, axis="columns")
            df = df.iloc[1:]
        columns = df.columns
        
        if len(columns) == 2:
            if "Virginia" in columns[0] and "Virginia" not in columns[1]:
                match = FILE_NAME_PARSE.match(path)
                county = match.group(2)
                
                county = county.replace(" ", "_")
                df = df.rename({columns[0]: "code_va", columns[1]: "local_code"}, axis=1)
                df["locality"] = county
                df = df.set_index("code_va")
                valid_dfs.append(df)
#                 print(f"Path, {path}, has the following columns, {df.columns}")
            else: 
                invalid_dfs.append(path)
                    
        else:
            invalid_dfs.append(path)
            
    return valid_dfs, invalid_dfs
    

In [98]:
valid_dfs, invalid_dfs = process_csvs(county_file_paths)

In [99]:
len(valid_dfs)

120

In [100]:
for df in valid_dfs[:3]:
    print(df.head())

              local_code      locality
code_va                               
1-1                  1-2  falls_church
1-21                 1-2  falls_church
1-200 et seq.        1-2  falls_church
1-210                1-2  falls_church
1-216                1-2  falls_church
        local_code  locality
code_va                     
1-210          1-2  Richmond
1-216          1-2  Richmond
1-217          1-8  Richmond
1-222          1-2  Richmond
1-223          1-2  Richmond
            local_code              locality
code_va                                     
1-1 et seq.        1-2  prince_george_county
1-13.9             1-3  prince_george_county
1-13.32            1-2  prince_george_county
1-17.1             1-7  prince_george_county
2.2-614.1        2-731  prince_george_county


### Concat Implementation

In [101]:
valid_dfs[0]

Unnamed: 0_level_0,local_code,locality
code_va,Unnamed: 1_level_1,Unnamed: 2_level_1
1-1,1-2,falls_church
1-21,1-2,falls_church
1-200 et seq.,1-2,falls_church
1-210,1-2,falls_church
1-216,1-2,falls_church
...,...,...
63.2-802,22-53,falls_church
63.2-1515,8-692,falls_church
63.2-1701,8-678,falls_church
63.2-1715—63.2-1717,8-678,falls_church


In [107]:
# merged data
big_df = pd.concat(valid_dfs, join="outer")
big_df

Unnamed: 0_level_0,local_code,locality
code_va,Unnamed: 1_level_1,Unnamed: 2_level_1
1-1,1-2,falls_church
1-21,1-2,falls_church
1-200 et seq.,1-2,falls_church
1-210,1-2,falls_church
1-216,1-2,falls_church
...,...,...
58.1-3993,14-49,haymarket_2
59.1-142 et seq.,30-4,haymarket_2
59.1-296,58-1,haymarket_2
62.1-44.2 et seq.,Ch. 50,haymarket_2


## Merge Implementation

In [13]:
def merge_dfs(single_df, df):
    return pd.merge(single_df, df, on="code_va")

In [14]:
big_df = pd.merge(valid_dfs[0], valid_dfs[1], on="code_va")
big_df

Unnamed: 0_level_0,falls_church,Richmond
code_va,Unnamed: 1_level_1,Unnamed: 2_level_1
1-210,1-2,1-2
1-216,1-2,1-2
1-217,1-5,1-8
1-223,1-2,1-2
1-227,1-2,1-2
...,...,...
62.1-44.15:24 et seq.,"35-1, 35-2",14-149
62.1-44.15:51 et seq.,"35-1, 35-2","Ch. 14, Art. III"
62.1-44.15:52,35-14,14-149
62.1-44.15:67 et seq.,Ch. 35,"Ch. 14, Art. IV"


In [15]:
pd.merge(big_df, valid_dfs[2], on="code_va")

Unnamed: 0_level_0,falls_church,Richmond,prince_george_county
code_va,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
3.2-6503,4-75,4-241,6-149
3.2-6538,"4-40, 4-41",4-243,6-146
9.1-101,28-202,3-73,6-3
10.1-1400 et seq.,Ch. 34,Ch. 23 (note),Ch. 66
15.2-901,33-10,"Ch. 11, Art. IV",54-27
...,...,...,...
58.1-3800 et seq.,Ch. 40,"Ch. 26, Art. XI","Ch. 74, Art. IV"
58.1-3814,40-296—40-299,"Ch. 26, Art. VII",74-153
58.1-3916,40-283,26-331,14-33
58.1-3986,40-459,26-870,14-34


In [106]:
big_df = pd.merge(valid_dfs[0], valid_dfs[1], on="code_va")
for i, df in enumerate(valid_dfs[2:]): 
    big_df = pd.merge(big_df, df, how="outer")
    print(f"processed {i} dataframes...")
    
