In [1]:
import csv
import pandas as pd

In [26]:
# Lookup tables that translate the numeric codes found in the input CSV into
# their labels. Each top-level key is "Format=<column name>"; each inner dict
# maps the code (as a string, exactly as csv.DictReader yields it) to a label.
dic = {
    "Format=Year of diagnosis": {
        # Code "0" is the all-years bucket; codes "1".."21" are the single
        # calendar years 2000..2020 in order.
        "0": "2000-2020",
        **{str(code): str(1999 + code) for code in range(1, 22)},
    },
    "Format=Rural-Urban Continuum Code": dict(
        zip(
            map(str, range(9)),
            [
                "Metropolitan Counties",
                "Counties in metropolitan areas ge 1 million pop",
                "Counties in metropolitan areas of 250,000 to 1 million pop",
                "Counties in metropolitan areas of lt 250 thousand pop",
                "Nonmetropolitan Counties",
                "Nonmetropolitan counties adjacent to a metropolitan area",
                "Nonmetropolitan counties not adjacent to a metropolitan area",
                "Unknown/missing/no match (Alaska or Hawaii - Entire State)",
                "Unknown/missing/no match/Not 1990-2021",
            ],
        )
    ),
}

# Function to convert values
def convert_value(header, value):
    """Translate one CSV cell from its code to its label where applicable.

    Args:
        header: Column name the cell belongs to.
        value: Raw cell value as read from the CSV.

    Returns:
        For the "Year of diagnosis" column, the label stored under ``value``
        in ``dic["Format=Year of diagnosis"]``; if the code is not in that
        table, or the cell belongs to any other column, ``value`` unchanged.
    """
    # Translation of "Rural-Urban Continuum Code" through
    # dic["Format=Rural-Urban Continuum Code"] is disabled: that column is
    # passed through with its numeric codes intact.
    if header != "Year of diagnosis":
        return value

    year_labels = dic["Format=Year of diagnosis"]
    return year_labels.get(value, value)

# Read the raw export, translate the coded cells, and write the result
input_file = "seer_diagnosisYear_rucc_040124.csv"
output_file = "seer_diagnosisYear_rucc_yearConverted.csv"

# Both handles are opened with newline='' as the csv module requires: on the
# read side this lets the reader itself handle line endings, so quoted fields
# that contain embedded newlines are parsed correctly; on the write side it
# prevents extra blank lines on platforms that translate '\n'.
with open(input_file, 'r', newline='') as csv_input, \
        open(output_file, 'w', newline='') as csv_output:
    reader = csv.DictReader(csv_input)
    # Reuse the input header so the output keeps the same columns and order.
    writer = csv.DictWriter(csv_output, fieldnames=reader.fieldnames)
    writer.writeheader()

    # Stream rows straight through: each cell is passed to convert_value
    # together with its column name.
    writer.writerows(
        {header: convert_value(header, value) for header, value in row.items()}
        for row in reader
    )

print("Conversion complete. Converted file saved as:", output_file)


Conversion complete. Converted file saved as: seer_diagnosisYear_rucc_yearConverted.csv


In [4]:
# Load Ruralurbancontinuumcodes2023.csv into a DataFrame
csv_file_path = "Ruralurbancontinuumcodes2023.csv"
data = pd.read_csv(csv_file_path)

# For each State, collect the modal RUCC_2023 value(s). When several values
# tie within a state, the cell holds an array of all of them (e.g. AS, DE and
# HI in the printed output), so this column is not yet one-value-per-state.
most_common_rucc = (
    data
    .groupby('State')['RUCC_2023']
    .agg(pd.Series.mode)
    .reset_index()
)

print(most_common_rucc)

# Export kept disabled:
# most_common_rucc.to_csv("Ruralurbancontinuumcodes2023_mode.csv", index=False)

   State        RUCC_2023
0     AK              9.0
1     AL              8.0
2     AR              9.0
3     AS  [5.0, 7.0, 9.0]
4     AZ              3.0
5     CA              1.0
6     CO              9.0
7     CT              2.0
8     DC              1.0
9     DE  [1.0, 3.0, 4.0]
10    FL              2.0
11    GA              8.0
12    GU              5.0
13    HI       [3.0, 5.0]
14    IA              9.0
15    ID              8.0
16    IL              1.0
17    IN              6.0
18    KS              9.0
19    KY       [8.0, 9.0]
20    LA              2.0
21    MA              1.0
22    MD              1.0
23    ME              8.0
24    MI              9.0
25    MN              9.0
26    MO              9.0
27    MP              9.0
28    MS       [8.0, 9.0]
29    MT              9.0
30    NC              2.0
31    ND              9.0
32    NE              9.0
33    NH              4.0
34    NJ              1.0
35    NM              6.0
36    NV              9.0
37    NY    

In [5]:
# Define a custom aggregation function to return the first mode value
def first_mode(x):
    """Return a single most-frequent value of ``x``.

    Args:
        x: A pandas Series (here: the RUCC_2023 values of one State group).

    Returns:
        The first entry of ``x.mode()``, so ties collapse to one value, or
        ``None`` when ``x.mode()`` is empty (no non-missing values).
    """
    candidates = x.mode()
    if len(candidates) == 0:
        return None
    # Several values may tie for most frequent; keep only the first one.
    return candidates.iloc[0]

# One RUCC_2023 value per State: first_mode collapses ties to a single value
most_common_rucc = (
    data
    .groupby('State')['RUCC_2023']
    .agg(first_mode)
    .reset_index()
)

# Persist the State -> RUCC_2023 table without the positional index column
most_common_rucc.to_csv("Ruralurbancontinuumcodes2023_firstModeValue.csv", index=False)

In [6]:
# Show the State / RUCC_2023 table built with first_mode (one value per state)
most_common_rucc

Unnamed: 0,State,RUCC_2023
0,AK,9.0
1,AL,8.0
2,AR,9.0
3,AS,5.0
4,AZ,3.0
5,CA,1.0
6,CO,9.0
7,CT,2.0
8,DC,1.0
9,DE,1.0


In [15]:
# Reload the file written by the CSV conversion cell (same name as its
# `output_file`). Note: this rebinds `csv_file_path`, which previously pointed
# at the RUCC 2023 file.
csv_file_path = "seer_diagnosisYear_rucc_yearConverted.csv"
data2 = pd.read_csv(csv_file_path)
# Display the loaded frame: Year of diagnosis, Rural-Urban Continuum Code, Count
data2

Unnamed: 0,Year of diagnosis,Rural-Urban Continuum Code,Count
0,2000-2020,0,7175018
1,2000-2020,1,4795718
2,2000-2020,2,1704535
3,2000-2020,3,674765
4,2000-2020,4,1030536
...,...,...,...
193,2020,4,49155
194,2020,5,28561
195,2020,6,20594
196,2020,7,496


In [17]:
def add_one(x):
    """Return ``x + 1``; applied element-wise to the RUCC code column below."""
    return x + 1

# Work on a copy so the frame loaded in the previous cell stays untouched
data2_copy = data2.copy()

# Shift every Rural-Urban Continuum Code up by one.
# NOTE(review): in `dic`, codes 0 and 4 are labelled "Metropolitan Counties"
# and "Nonmetropolitan Counties", which look like roll-up categories rather
# than individual codes - confirm the shifted values are meant to line up
# with RUCC_2023.
shifted_codes = data2_copy['Rural-Urban Continuum Code'].apply(add_one)
data2_copy['Rural-Urban Continuum Code'] = shifted_codes

# Remove the all-years summary rows; only single calendar years remain
is_all_years = data2_copy['Year of diagnosis'] == '2000-2020'
data2_copy = data2_copy.drop(index=data2_copy.loc[is_all_years].index)

# With the '2000-2020' label gone, the column can be converted to integers
data2_copy['Year of diagnosis'] = data2_copy['Year of diagnosis'].astype(int)

# Keep diagnosis years 2010 through 2020, both ends inclusive
in_window = data2_copy['Year of diagnosis'].between(2010, 2020)
data2_copy = data2_copy[in_window]

print(data2_copy)

data2_copy.to_csv("seer_diagnosisYear_rucc_2010to2020.csv", index=False)

     Year of diagnosis  Rural-Urban Continuum Code   Count
99                2010                           1  345340
100               2010                           2  230590
101               2010                           3   84343
102               2010                           4   30407
103               2010                           5   48152
..                 ...                         ...     ...
193               2020                           5   49155
194               2020                           6   28561
195               2020                           7   20594
196               2020                           8     496
197               2020                           9      66

[99 rows x 3 columns]
