In [16]:
import pandas as pd

district_data = pd.read_csv('cleaned_district_data.csv')
county_data = pd.read_csv('cleaned_county_data.csv')



In [17]:
# merging county and district datasets



# convert FIPS codes to state abbreviations
fips_to_abbrev = {1:'AL', 2:'AK', 4:'AZ', 6:'CA', 8:'CO', 9:'CT', 10:'DE', 11:'DC', 
                  12:'FL', 13:'GA', 15:'HI', 16:'ID', 17:'IL', 18:'IN', 19:'IA', 20:'KS',
                  21:'KY', 22:'LA', 23:'ME', 24:'MD', 25:'MA', 26:'MI', 27:'MN', 28:'MS',
                  29:'MO', 30:'MT', 31:'NE', 32:'NV', 33:'NH', 34:'NJ', 35:'NM', 36:'NY',
                  37:'NC', 38:'ND', 39:'OH', 40:'OK', 41:'OR', 42:'PA', 72:'PR', 44:'RI',
                  45:'SC', 46:'SD', 47:'TN', 48:'TX', 49:'UT', 50:'VT', 51:'VA', 78:'VI',
                  53:'WA', 54:'WV', 55:'WI', 56:'WY'}

county_data['state'] = county_data['state'].map(fips_to_abbrev)


district_data['district'] = district_data['district'].str.extract('(\d+)').astype(int)

# drops rows with odd years
county_data = county_data.drop( county_data[county_data['year'] % 2 != 0].index, axis=0)

# merges data based on state, district, and year
county_data = county_data.merge(district_data, on = ['state', 'district','year'], how = 'left')
#print(county_data)
# confirms that only the state category has NaN values 
county_data['state'].isnull().unique() # = [False, True]
county_data['district'].isnull().unique() # = [False]
county_data['year'].isnull().unique() # [False]

# drops state/district/year row if either state, district, or year contains an NaN value

NaN_removed = county_data[['state', 'district', 'year']].dropna() #.apply(tuple, axis=1)

# drops observations from district that did not match county by ensuring that the state/district/year combinations of district_data match dropped NaN 

district_data = district_data[ district_data[['state', 'district', 'year']].apply(tuple, axis=1).isin(NaN_removed)]


# identifies where county data did not match district data (any NaN entry) and applies single incumbent label
NaN_rows = county_data[ county_data[['state', 'district', 'year']].isnull().any(axis=1)]

# applies 0,1,2.. indexing to county_data to easily identify the NaN rows
county_data = county_data.reset_index(drop = True)


county_data.loc[NaN_rows.index, 'one_incumb'] = True
county_data.loc[NaN_rows.index, 'zero_incumbs'] = False
county_data.loc[NaN_rows.index, 'multiple_incumbs'] = False

# drops unamed rows that were the index columns
county_data = county_data.drop(['Unnamed: 0_x', 'Unnamed: 0_y'], axis = 1)

print(county_data['zero_incumbs'].unique())
print(county_data['one_incumb'].unique())
print(county_data['multiple_incumbs'].unique())

county_data.to_csv('/Users/divya/Documents/ra_data_task/data/merged_county_district_data.csv')


county_data


                  

  district_data['district'] = district_data['district'].str.extract('(\d+)').astype(int)


[True False nan]
[False True nan]
[False True nan]


Unnamed: 0,congress,year,state,county,district,cd_share_of_county,single_district_county,population,white,black,...,share_other_race,share_two_or_more,share_hispanic,share_white_not_hispanic,share_male,share_female,ico.status,zero_incumbs,one_incumb,multiple_incumbs
0,117,2022,AL,1001,2,1.0,1,58805,42160,11445,...,0.015475,0.194626,0.03600,0.707117,0.482782,0.517218,C,True,False,False
1,117,2022,AL,1001,2,1.0,1,58805,42160,11445,...,0.015475,0.194626,0.03600,0.707117,0.482782,0.517218,I,False,True,False
2,116,2020,AL,1001,2,1.0,1,58805,42160,11445,...,0.015475,0.194626,0.03600,0.707117,0.482782,0.517218,C,True,False,False
3,116,2020,AL,1001,2,1.0,1,58805,42160,11445,...,0.015475,0.194626,0.03600,0.707117,0.482782,0.517218,I,False,True,False
4,116,2020,AL,1001,2,1.0,1,58805,42160,11445,...,0.015475,0.194626,0.03600,0.707117,0.482782,0.517218,O,True,False,False
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
62031,109,2006,WY,56045,1,1.0,1,6644,6374,8,...,0.009332,0.001204,0.02062,0.947923,0.507676,0.492324,I,False,True,False
62032,108,2004,WY,56045,1,1.0,1,6644,6374,8,...,0.009332,0.001204,0.02062,0.947923,0.507676,0.492324,C,True,False,False
62033,108,2004,WY,56045,1,1.0,1,6644,6374,8,...,0.009332,0.001204,0.02062,0.947923,0.507676,0.492324,I,False,True,False
62034,107,2002,WY,56045,1,1.0,1,6644,6374,8,...,0.009332,0.001204,0.02062,0.947923,0.507676,0.492324,C,True,False,False
