In [106]:
# Import libraries etc
import pandas as pd
import re
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import LinearRegression
from sklearn.metrics import mean_squared_error, r2_score
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline

In [118]:
# Load the dataset
df = pd.read_csv('data/all_cancer_and_pesticides.csv')
df

Unnamed: 0,County,FIPS,Cancer Incidence Rate Per 100k,Breast Cancer Incidence Rate Per 100k,Leukemia Incidence Rate Per 100k,Lung Cancer Incidence Rate Per 100k,Non-Hodgkin Lymphoma Incidence Rate Per 100k,Prostate Cancer Incidence Rate Per 100k,Insecticide Sum,Herbicide Sum,Fungicide Sum
0,"Union County, Florida(6)",12125,1237.4,250.6,56.5,182.9,48.5,295.7,253.1,7146.6,241.5
1,"Palo Alto County, Iowa(7)",19147,658.1,239.7,33.6,141.9,47.9,258.5,5897.5,269873.7,4271.9
2,"Treasure County, Montana(6)",30103,652.2,213.0,29.2,132.3,41.7,229.1,839.8,76881.9,488.3
3,"Polk County, Texas(7)",48373,633.6,209.5,27.7,131.5,40.2,226.7,1.2,1694.6,1.6
4,"Floyd County, Kentucky(7)",21071,616.8,204.6,27.7,127.7,36.7,225.7,4.2,812.9,3.8
...,...,...,...,...,...,...,...,...,...,...,...
1592,"Hickman County, Kentucky(7)",21105,436.4,113.0,7.3,54.6,14.1,95.9,1753.6,245496.5,5383.3
1593,"Butler County, Nebraska(6)",31023,436.3,113.0,6.9,54.5,14.1,95.8,2697.3,370666.0,4818.5
1594,"Westmoreland County, Pennsylvania(6)",42129,436.3,112.9,6.4,54.5,14.1,95.8,1796.6,28806.5,839.8
1595,"New York County, New York(7)",36061,436.2,112.9,6.3,54.5,14.1,95.8,0.0,0.0,0.0


In [119]:
# Remove the row for Union County, Florida
df = df[df['County'] != 'Union County, Florida(6)']
df

Unnamed: 0,County,FIPS,Cancer Incidence Rate Per 100k,Breast Cancer Incidence Rate Per 100k,Leukemia Incidence Rate Per 100k,Lung Cancer Incidence Rate Per 100k,Non-Hodgkin Lymphoma Incidence Rate Per 100k,Prostate Cancer Incidence Rate Per 100k,Insecticide Sum,Herbicide Sum,Fungicide Sum
1,"Palo Alto County, Iowa(7)",19147,658.1,239.7,33.6,141.9,47.9,258.5,5897.5,269873.7,4271.9
2,"Treasure County, Montana(6)",30103,652.2,213.0,29.2,132.3,41.7,229.1,839.8,76881.9,488.3
3,"Polk County, Texas(7)",48373,633.6,209.5,27.7,131.5,40.2,226.7,1.2,1694.6,1.6
4,"Floyd County, Kentucky(7)",21071,616.8,204.6,27.7,127.7,36.7,225.7,4.2,812.9,3.8
5,"Logan County, Nebraska(6)",31113,609.8,198.3,27.2,126.4,34.7,219.4,604.5,39672.1,420.4
...,...,...,...,...,...,...,...,...,...,...,...
1592,"Hickman County, Kentucky(7)",21105,436.4,113.0,7.3,54.6,14.1,95.9,1753.6,245496.5,5383.3
1593,"Butler County, Nebraska(6)",31023,436.3,113.0,6.9,54.5,14.1,95.8,2697.3,370666.0,4818.5
1594,"Westmoreland County, Pennsylvania(6)",42129,436.3,112.9,6.4,54.5,14.1,95.8,1796.6,28806.5,839.8
1595,"New York County, New York(7)",36061,436.2,112.9,6.3,54.5,14.1,95.8,0.0,0.0,0.0


In [109]:
# Clean and preprocess the data to remove the number from county
df.loc[:, 'County'] = df['County'].apply(lambda x: re.sub(r'\(\d+\)$', '', x.strip()))  
df


Unnamed: 0,County,FIPS,Cancer Incidence Rate Per 100k,Breast Cancer Incidence Rate Per 100k,Leukemia Incidence Rate Per 100k,Lung Cancer Incidence Rate Per 100k,Non-Hodgkin Lymphoma Incidence Rate Per 100k,Prostate Cancer Incidence Rate Per 100k,Insecticide Sum,Herbicide Sum,Fungicide Sum
1,"Palo Alto County, Iowa",19147,658.1,239.7,33.6,141.9,47.9,258.5,5897.5,269873.7,4271.9
2,"Treasure County, Montana",30103,652.2,213.0,29.2,132.3,41.7,229.1,839.8,76881.9,488.3
3,"Polk County, Texas",48373,633.6,209.5,27.7,131.5,40.2,226.7,1.2,1694.6,1.6
4,"Floyd County, Kentucky",21071,616.8,204.6,27.7,127.7,36.7,225.7,4.2,812.9,3.8
5,"Logan County, Nebraska",31113,609.8,198.3,27.2,126.4,34.7,219.4,604.5,39672.1,420.4
...,...,...,...,...,...,...,...,...,...,...,...
1592,"Hickman County, Kentucky",21105,436.4,113.0,7.3,54.6,14.1,95.9,1753.6,245496.5,5383.3
1593,"Butler County, Nebraska",31023,436.3,113.0,6.9,54.5,14.1,95.8,2697.3,370666.0,4818.5
1594,"Westmoreland County, Pennsylvania",42129,436.3,112.9,6.4,54.5,14.1,95.8,1796.6,28806.5,839.8
1595,"New York County, New York",36061,436.2,112.9,6.3,54.5,14.1,95.8,0.0,0.0,0.0


In [120]:
# Separate into X and y
X = df.drop(['FIPS', 'Cancer Incidence Rate Per 100k'], axis=1)
y = df['Cancer Incidence Rate Per 100k']

In [121]:
# Define categorical columns 
categorical_cols = ['County']

# Define preprocessing steps for numerical and categorical data
numerical_transformer = StandardScaler()
categorical_transformer = OneHotEncoder()

preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, ~X.columns.isin(categorical_cols)),
        ('cat', categorical_transformer, categorical_cols)
    ],
    remainder='passthrough'
)

In [112]:
# Create a pipeline with preprocessing and modeling steps
pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                           ('regressor', LinearRegression())])


In [113]:
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

# Fit the pipeline on the entire dataset (training and testing combined)
pipeline.fit(X, y)


In [114]:
# Make predictions on the testing set
y_pred = pipeline.predict(X_test)

In [115]:
# Evaluate the model
mse = mean_squared_error(y_test, y_pred)
r2 = r2_score(y_test, y_pred)

# Print the model evaluation metrics
print(f"Mean Squared Error: {mse}")
print(f"R-squared: {r2}")

Mean Squared Error: 1.327123912564297e-12
R-squared: 0.9999999999999986


In [116]:
# Save performance metrics to a CSV file for documentation and comparison
metrics_df = pd.DataFrame({'Metric': ['Mean Squared Error', 'R-squared'],
                           'Value': [mse, r2]})
metrics_df.to_csv('model_performance_metrics.csv', index=False)