# Create Single DataFrame with all County Data

In [81]:
import pandas as pd
import glob
import re
from pathlib import Path

In [82]:
def grab_county_file_paths(directory="countyCSV_March7"):
    """Return the paths of every ``*.csv`` file directly inside *directory*.

    Args:
        directory: Folder to search (str or ``Path``). Defaults to the
            ``countyCSV_March7`` folder relative to the working directory.

    Returns:
        A list of ``pathlib.Path`` objects, one per matching file. The search
        is not recursive, and the order is whatever ``Path.glob`` yields.
    """
    return list(Path(directory).glob("*.csv"))


In [83]:
# Collect the path of every county CSV that will be loaded below.
county_file_paths = grab_county_file_paths()

In [84]:
# county_file_paths

In [85]:
def process_csvs(paths):
    """Load each CSV and normalise it to a ``code_va``-indexed DataFrame.

    A file is accepted when its first column label mentions "Virginia" (or
    "Va", provided the second label does not mention "Virginia"). Accepted
    frames have their first two columns renamed to ``code_va`` and
    ``local_code``, gain a ``locality`` column holding the file's stem, and
    are indexed by ``code_va``.

    Args:
        paths: Iterable of ``pathlib.Path`` objects pointing at CSV files.

    Returns:
        A ``(valid_dfs, invalid_dfs)`` tuple. ``valid_dfs`` is a list of the
        normalised DataFrames; ``invalid_dfs`` is a list of the *paths* (not
        frames) that could not be recognised, including empty files and files
        with fewer than two columns.
    """
    valid_dfs = []
    invalid_dfs = []

    for path in paths:
        try:
            df = pd.read_csv(path)
        except pd.errors.EmptyDataError:
            # A zero-byte file has no header to inspect; report it instead of
            # aborting the whole batch.
            invalid_dfs.append(path)
            continue

        county = path.stem

        # When the column labels are the placeholders "0"/"1", the first data
        # row is used as the header and then dropped from the data. A frame
        # with no rows has nothing to promote, so it is left as-is.
        if ("0" in df.columns or "1" in df.columns) and not df.empty:
            header = df.iloc[0]
            df = df.rename(header, axis="columns")
            df = df.iloc[1:]
        columns = df.columns

        # Both of the first two columns are needed for the rename below.
        if len(columns) < 2:
            invalid_dfs.append(path)
            continue

        # Labels may be non-strings (e.g. NaN from a promoted header row);
        # compare on their text form so the substring tests cannot raise.
        first_label = str(columns[0])
        second_label = str(columns[1])

        # NOTE(review): the original condition was written without
        # parentheses as `A or B and C`; since `and` binds tighter than `or`
        # it evaluates as below. If `(A or B) and C` was intended, regroup
        # here -- confirm against the source files before changing.
        is_code_table = "Virginia" in first_label or (
            "Va" in first_label and "Virginia" not in second_label
        )

        if is_code_table:
            df = df.rename({columns[0]: "code_va", columns[1]: "local_code"}, axis=1)
            df["locality"] = county
            df = df.set_index("code_va")
            valid_dfs.append(df)
        else:
            invalid_dfs.append(path)

    return valid_dfs, invalid_dfs
    

In [86]:
# Split the county files into parsed DataFrames and the paths that were rejected.
valid_dfs, invalid_dfs = process_csvs(county_file_paths)

In [87]:
# Number of files that were recognised and normalised.
len(valid_dfs)

84

In [88]:
# Number of files that were rejected.
len(invalid_dfs)

1

In [89]:
# Despite its name, invalid_dfs holds file paths, not DataFrames. Re-read each
# rejected file and preview it to see why its header was not recognised.
for p in invalid_dfs:
    df = pd.read_csv(p)
    print(df.head())

  Code of  Virgina  Section  salem
0              1-10 et seq.  Ch. 1
1                    1-13.3    1-2
2                  1-13.3:1    1-2
3            1-13.6, 1-13.7    1-2
4                    1-13.9    1-6


In [90]:
# Preview the first three normalised frames to confirm the shared layout
# (code_va index, local_code and locality columns).
for df in valid_dfs[:3]:
    print(df.head())

                  local_code             locality
code_va                                          
1-3.9                    1-3  spotsylvania_county
1-13.3 et seq.           1-2  spotsylvania_county
2.1-116.1             16-147  spotsylvania_county
2.1-116.1 et seq.     16-147  spotsylvania_county
2.1-340 et seq.     2-107(b)  spotsylvania_county
              local_code      locality
code_va                               
1-1                  1-2  falls_church
1-21                 1-2  falls_church
1-200 et seq.        1-2  falls_church
1-210                1-2  falls_church
1-216                1-2  falls_church
        local_code  locality
code_va                     
1-210          1-2  richmond
1-216          1-2  richmond
1-217          1-8  richmond
1-222          1-2  richmond
1-223          1-2  richmond


### Concat Implementation

In [91]:
# Stack every per-locality frame row-wise into a single DataFrame.
# join="outer" keeps the union of columns across frames, so a column present in
# only some files is retained and filled with NaN elsewhere.
big_df = pd.concat(valid_dfs, join="outer")
big_df

Unnamed: 0_level_0,local_code,locality
code_va,Unnamed: 1_level_1,Unnamed: 2_level_1
1-3.9,1-3,spotsylvania_county
1-13.3 et seq.,1-2,spotsylvania_county
2.1-116.1,16-147,spotsylvania_county
2.1-116.1 et seq.,16-147,spotsylvania_county
2.1-340 et seq.,2-107(b),spotsylvania_county
...,...,...
"59.1-142, 59.1-143 10-7","59.1-142, 59.1-143 10-7",colonial_beach
59.1-144,10-7,colonial_beach
,10-8,colonial_beach
59.1-145—59.1-148 10-7,59.1-145—59.1-148 10-7,colonial_beach


## Clean Data
### TODO - up next

In [92]:
# Spot check: pull every row whose code_va is "1-1" to see which localities
# reference that section.
big_df.groupby("code_va").get_group("1-1")

Unnamed: 0_level_0,local_code,locality
code_va,Unnamed: 1_level_1,Unnamed: 2_level_1
1-1,1-2,falls_church
1-1,1-2,pulaski
1-1,1-2,clifton_forge
1-1,1-2,christiansburg
