# Project 1

Data from: Heyes, Anthony, and Soodeh Saberian. 2019. "Temperature and Decisions: Evidence from 207,000 Court Cases." American Economic Journal: Applied Economics, 11 (2): 238–65.

Notebooks used throughout the code:
- ISLP-Ch06_varselect_lab.ipynb
- ISLP-TreeModels.ipynb
- CIDP-Chapter_04
- CIDP-Chapter_05
- CIDP-Chapter_07

In [None]:
import pandas as pd
import numpy as np
import statsmodels.api as sm
import sklearn.linear_model as skl
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
import sklearn.model_selection as skm
from sklearn.model_selection import GridSearchCV, KFold
from matplotlib.pyplot import subplots
from statsmodels.discrete.discrete_model import Probit
import statsmodels.api as sm
from sklearn.linear_model import RidgeClassifier
from sklearn.linear_model import SGDClassifier

In [None]:
#For Tree
from sklearn.tree import (DecisionTreeClassifier as DTC,
                          DecisionTreeRegressor as DTR,
                          plot_tree,
                          export_text)
from sklearn.metrics import (accuracy_score,
                             log_loss)
from sklearn.ensemble import \
     (RandomForestClassifier as RFC,
      GradientBoostingClassifier as GBC)
from sklearn.metrics import confusion_matrix

In [None]:
#For Graphs
from scipy import stats

import matplotlib.pyplot as plt
plt.style.use('fivethirtyeight')

import graphviz
import networkx as nx
# Plot palette: index 0 is blue, index 1 is red
COLORS = ['#00B0F0', '#FF0000']
import matplotlib.patches as patches

In [None]:
#DAG
!pip install dowhy
import dowhy
from dowhy import CausalModel

## Data Description 

In [None]:
#Read the dataset from the replication package
#The path is relative, so the .dta file must sit in the notebook's working directory
df = pd.read_stata('matched_corrected.dta')
#Last expression of the cell: displays descriptive statistics of the loaded data
df.describe()

In [None]:
# Show every column name so the variables used below can be identified
columns_list = list(df.columns)
print(columns_list)

In [None]:
#Create a dummy for asylum
#1 when the asylum type code is 'E', 0 for every other value
df['dummy_asylum'] = df['c_asy_type'].apply(lambda x: 1 if x == 'E' else 0)
#Create a dummy for gender
#1 when gender is 'female', 0 for every other value
#NOTE(review): how missing values are coded here depends on the column dtype returned by read_stata - confirm
df['dummy_gender'] = df['gender'].apply(lambda x: 1 if x == 'female' else 0)

In [None]:
#As outlined in the correction article drop the observation for China
#.copy() makes the filtered frame independent of the original, so the new columns
#added in the following cells do not raise SettingWithCopyWarning
df = df[df['nat_name'] != 'CHINA'].copy()

In [None]:
# Get unique values to identify variables for the dummy variables
# Distinct nationality names present in the data
unique__names = df['nat_name'].unique()
# Distinct court locations present in the data (this name is reassigned in a later cell)
locations = df['location'].unique()

#Create a categorical variable for nationality

# Define the list of regions
# Every country appears in exactly one list so the regional dummies built from
# these lists are mutually exclusive. Nationalities that appear in no list get
# 0 on every regional dummy.
middle_eastern_countries = ["BAHRAIN", "CYPRUS", "EGYPT", "IRAN", "IRAQ", "ISRAEL", "JORDAN", 
    "KUWAIT", "LEBANON", "OMAN", "PALESTINE", "QATAR", "SAUDI ARABIA", 
    "SYRIA", "TURKEY", "UNITED ARAB EMIRATES", "YEMEN"]

africa = ["ERITREA", "RWANDA", "SOMALIA", "SUDAN", "CONGO", "ETHIOPIA", "LIBYA", 
    "MALI", "ANGOLA", "BURUNDI", "TANZANIA", "NIGERIA", "GABON", "GHANA", 
    "SENEGAL", "CHAD", "DJIBOUTI", "CAMEROON", "UGANDA", "KENYA", 
    "ZAMBIA", "MAURITANIA", "SOUTH AFRICA", "GUINEA", "BURKINA FASO", 
    "MOROCCO", "ALGERIA", "COMORO ISLANDS", "EQUATORIAL GUINEA", 
    "CENTRAL AFRICAN REPUBLIC", "CAPE VERDE", "LESOTHO", "SWAZILAND", 
    "GAMBIA", "SIERRA LEONE", "GUINEA BISSAU"]

america = ["GUATEMALA", "EL SALVADOR", "PANAMA", "COLOMBIA", 
    "ARGENTINA", "HAITI", "VENEZUELA", "MEXICO", "CUBA", "DOMINICAN REPUBLIC", 
    "BRAZIL", "CHILE", "SURINAME", "TRINIDAD AND TOBAGO", "JAMAICA", 
    "CANADA", "USA", "ST. KITTS, WEST INDIES", "ANTIGUA AND BARBUDA", 
    "BARBADOS", "BAHAMAS", "BELIZE", "DOMINICA", "GRENADA", 
    "NICARAGUA", "URUGUAY", "PARAGUAY", "ST. LUCIA", "ST. VINCENT AND THE GRENADINES"]

# "IRAN" is classified as Middle Eastern above and is therefore not repeated here;
# listing it twice would set both the middleast and the asia dummy to 1.
asia = ["PAKISTAN", "VIETNAM", "INDONESIA", "AFGHANISTAN", 
    "BANGLADESH", "PHILIPPINES", "TAIWAN", "MALAYSIA", 
    "KAZAKHSTAN", "KYRGYZSTAN", "THAILAND", "TURKMENISTAN", "UZBEKISTAN", 
    "MONGOLIA", "SRI LANKA", "BHUTAN", "LAOS", "NEPAL", 
    "MYANMAR", "KAMPUCHEA", "BRUNEI", "BURMA", "KOREA", "NORTH KOREA"]

europe = ["RUSSIA", "ARMENIA", "ALBANIA", "YUGOSLAVIA", "UNITED KINGDOM", 
    "BULGARIA", "ROMANIA", "HUNGARY", "POLAND", "CZECH REPUBLIC", 
    "SLOVAK REPUBLIC", "GERMANY", "FRANCE", "ITALY", "SPAIN", 
    "SWEDEN", "DENMARK", "FINLAND", "AUSTRIA", "SWITZERLAND", 
    "BELGIUM", "GREECE", "NETHERLANDS", "CROATIA", "SLOVENIA", 
    "MONACO", "LITHUANIA", "LATVIA", "ESTONIA", "ICELAND"]

# Map each regional dummy column to the list of nationalities it covers
nationality_regions = {
    'middleast': middle_eastern_countries,
    'america': america,
    'africa': africa,
    'asia': asia,
    'europe': europe,
}

# Regional dummies: 1 where the nationality is in the region's list, 0 otherwise
for region_name, region_countries in nationality_regions.items():
    df[region_name] = df['nat_name'].isin(region_countries).astype('int64')

#Create interaction terms
# <region>_dev is the regional dummy multiplied by the temperature variable temp6t4
for region_name in nationality_regions:
    df[f'{region_name}_dev'] = df[region_name] * df['temp6t4']

In [None]:
#Create a categorical variable for location and group locations into regions
#Every location appears in exactly one list so the four region dummies built
#from these lists are mutually exclusive.
#NOTE(review): a few assignments look geographically unusual (for example
#'*WISCONSIN DOC' under northeast) - confirm they are intended.
northeast = ['NEWARK', 'BOSTON', 'NEW YORK CITY', 'BUFFALO', 'PHILADELPHIA', 
    'NEW YORK ANNEX', 'NY DET (VARICK ST.)', 'HARTFORD', 
    '*PA DOC.', '*BOP  DANBURY', '*RI  DOC',
    '*WISCONSIN DOC', '*NH  DOC', '*SUFFOLK COUNTY', '*NEWARK VIDEO HEARINGS',
    #'*JESSUP' and '*BOP ALLENWOOD' are two separate entries; without the comma
    #Python joins the adjacent string literals into one name that matches nothing
    '*JESSUP', '*BOP ALLENWOOD', '*NORTHERN STATE NJ DOC', 'YORK COUNTY DET']

#'CLEVELAND' is listed here only (together with CINCINNATI and *OHIO DOC), not in northeast
midwest = ['CHICAGO', 'DETROIT', 'CINCINNATI', 'CLEVELAND', 'ST. LOUIS', 
    'MEMPHIS', 'KANSAS CITY', 'OMAHA', '*MI  DOC', 
    '*IL DOC - STATESVILLE', '*MO DOC', '*OHIO DOC', 
    '*INDIANA YOUTH CENTER']

south = ['ARLINGTON', 'DALLAS', 'HOUSTON', 'MIAMI', 'ATLANTA', 
    'NEW ORLEANS', 'SAN ANTONIO', 'DALLAS DET', 'SAN ANTONIO DET', 
    'HOUSTON DET', 'ATLANTA DET', '*GEORGIA DOC', '*VA DOC', 
    '*DADE COUNTY FL DOC', '*BROWARD  FL DOC', 'ORLANDO', 'KROME DET',
    'PORT ISABEL DET', 'EL PASO', 'EL PASO DET', '*TX DOC', 
    'LOUISVILLE', 'OKLAHOMA CITY', 'OKLAHOMA CITY DET', 
    'BATAVIA SPC', 'BROWARD TRANS CTR', 'ST. THOMAS', 'ST. CROIX', 'ROLLING PLAINS DETENTION CENTER',
    '*BOP BIG SPRING AIRPARK', 'BRADENTON DET']

west = ['DENVER', 'SAN DIEGO', 'LOS ANGELES', 'SAN FRANCISCO', 
    'PHOENIX', 'LAS VEGAS', 'RENO', 'SALT LAKE CITY', 'OTAY MESA', 
    'TUCSON', 'HONOLULU', 'SAN JUAN', 'SEATTLE', 'PORTLAND',
    'SAN FRANCISCO DET', 'DENVER DET', 'SAN DIEGO DETAINED', 
    'MIRA LOMA DET', 'HONOLULU DET', '*CO DOC', '*AZ DOC',
    '*WA DOC', '*AK DOC', 'ANCHORAGE', 'SAN PEDRO', 
    'IMPERIAL', '*NM DOC', 'PORTLAND DET', '*MONROE WA DOC', 'SAN FRANCISCO ANNEX']


# Map each regional dummy column to the list of court locations it covers
court_regions = {
    'northeast': northeast,
    'midwest': midwest,
    'south': south,
    'west': west,
}

# Regional dummies: 1 where the court location is in the region's list, 0 otherwise
for court_region, court_names in court_regions.items():
    df[court_region] = df['location'].isin(court_names).astype('int64')

In [None]:
#Create a date categorical variable
#Calendar year of each case, taken from the datetime column
df['year'] = df['date'].dt.year

#Years and court regions that receive indicator columns
years = [2000, 2001, 2002, 2003, 2004]
locations = ['northeast', 'midwest', 'south', 'west']

#create dummy for year
#year2000 ... year2004 are 1 for cases decided in that year and 0 otherwise
for year in years:
    df[f'year{year}'] = (df['year'] == year).astype('int64')

# Interaction term for location and year
# <region>_year<yyyy> is 1 only for cases in that region and that year
for year in years:
    for location in locations:
        df[f'{location}_year{year}'] = df[f'year{year}'] * df[location]

In [None]:
# Create dummy variables for the months 

# Month number of each case, taken from the datetime column
df['month'] = df['date'].dt.month
# Replace the 'month' column by one indicator column per month (prefix 'month_');
# drop_first=False keeps every month, the reference month is left out later
# when the model variables are selected
df = pd.get_dummies(df, columns=['month'], prefix='month', drop_first=False)

In [None]:
#Clean the dataset

#Drop asylum cases with no classification
#Only the asylum type codes 'E' and 'I' are kept
df = df[df['c_asy_type'].isin(['E', 'I'])]

# Clean dataset by dropping any rows with NA observations
# Any row with a missing value in any column is removed, including columns
# that are not used in the models.
# .copy() gives df_final its own data, so later column assignments on df_final
# do not raise SettingWithCopyWarning
df_final = df.dropna(axis=0).copy()

In [None]:
#Calculate the summary statistics for the main variables of interest
main_variables = ['temp6t4','heat','wind6t4','res','dummy_gender']

# Full descriptive table for the selected columns
summary_stats = df_final[main_variables].describe()
print(summary_stats)

# Column means on their own
mean_values = df_final[main_variables].mean()
print(mean_values)

In [None]:
#Summary statistics per region to check if the observations are balanced

#List of locations
locations = ['northeast', 'midwest', 'south', 'west']

for location in locations:
    # Keep only the cases heard in the current region
    df_filtered = df_final.loc[df_final[location] == 1]
    # Describe the resolution variable within that region
    summary_stats = df_filtered['res'].describe()
    # Header, table and a blank spacer, printed one per line
    print(f"Summary statistics for {location}:", summary_stats, "\n", sep="\n")

## Probit

In [None]:
#Create my Y variable
#Outcome for every model below: the 'res' column of the cleaned data as a NumPy array
Y = np.array(df_final['res'])

In [None]:
#Create X variables for different specifications
#Note: drop one category for each dummy
#(the omitted categories are asia, west, year2004 and month_12)

#Weather and pollution controls shared by the specifications that include them
weather_controls = ['skycover', 'co', 'co_distance', 'ozone', 'ozone_distance', 'pm',
                    'pm_distance', 'press6t4', 'dew6t4', 'prcp6t4', 'wind6t4',
                    'rh6t4']

#Case characteristics and fixed effects shared by every specification
#NOTE(review): the *_dev interactions are built from temp6t4, also in the
#deviation specification - confirm this is intended
case_controls = ['chair', 'dummy_asylum', 'dummy_gender',
                 'middleast', 'america', 'africa', 'europe', 'northeast', 'midwest',
                 'south', 'year2000', 'year2001', 'year2002',
                 'year2003', 'middleast_dev', 'america_dev', 'africa_dev', 'europe_dev', 'month_1',
                 'month_2', 'month_3', 'month_4', 'month_5', 'month_6', 'month_7', 'month_8',
                 'month_9', 'month_10', 'month_11']

#Specification 1
selectedvariables = ['temp6t4', 'heat'] + weather_controls + case_controls

#Specification 2
selectedvariables_noweather = ['temp6t4'] + case_controls

#Additional Fixed Effects
#Region-by-year interactions. The reference region (west) and reference year
#(2004) are left out: with all five years per region the interactions add up
#to the region dummy itself and the design matrix is perfectly collinear.
year_location = [f'{region}_year{year}'
                 for region in ('northeast', 'midwest', 'south')
                 for year in (2000, 2001, 2002, 2003)]

# Deviation Specification - Future Steps
selectedvariables_deviation = ['deviation', 'heat'] + weather_controls + case_controls

all_variables = selectedvariables + year_location
                        
#Create X variables with different specification
#Specification 1: temperature, weather/pollution controls and fixed effects
X = df_final[selectedvariables]

#Specification 2: same as specification 1 without the weather/pollution controls
X_no_control = df_final[selectedvariables_noweather]

#Specification 1 plus the region-by-year interactions
X_all = df_final[all_variables]

#Specification 1 with 'deviation' in place of 'temp6t4'
X_deviation = df_final[selectedvariables_deviation]

In [None]:
#Specification 1
#statsmodels does not add an intercept on its own. One category of every dummy
#set is dropped in X, so the model needs a constant to absorb the reference group.
X_design = sm.add_constant(X.astype(float))
model_1 = Probit(Y, X_design)
probit_model1 = model_1.fit()
print(probit_model1.summary())

# Calculate predicted probabilities
predicted_probs = probit_model1.predict(X_design)

# Calculate marginal effect for the variable of interest
# For a probit the marginal effect of x_j is beta_j * phi(x'beta), where phi is
# the standard normal density; p * (1 - p) is the logit formula and does not apply.
# NOTE(review): temp6t4 also enters through the *_dev interaction terms, which
# this single-coefficient effect does not include.
linear_index = np.asarray(X_design) @ np.asarray(probit_model1.params)
marginal_effect_temp = probit_model1.params['temp6t4'] * stats.norm.pdf(linear_index)
average_marginal_effect_temp = np.mean(marginal_effect_temp)

print(f"Average Marginal Effect: {average_marginal_effect_temp}")

In [None]:
#Specification 1 - Deviation - Future Steps

#A constant is added explicitly: statsmodels does not include an intercept and
#the dummy sets in X_deviation each omit one reference category
model_1_deviation = Probit(Y, sm.add_constant(X_deviation.astype(float)))
probit_model1_deviation = model_1_deviation.fit()
print(probit_model1_deviation.summary())

In [None]:
#Specification 2

#statsmodels does not add an intercept on its own. One category of every dummy
#set is dropped in X_no_control, so the model needs a constant.
X_no_control_design = sm.add_constant(X_no_control.astype(float))
model_2 = Probit(Y, X_no_control_design)
probit_model2 = model_2.fit()
print(probit_model2.summary())

# Calculate predicted probabilities
predicted_probs = probit_model2.predict(X_no_control_design)

# Calculate marginal effect for the variable of interest 
# For a probit the marginal effect of x_j is beta_j * phi(x'beta), where phi is
# the standard normal density; p * (1 - p) is the logit formula and does not apply.
linear_index = np.asarray(X_no_control_design) @ np.asarray(probit_model2.params)
marginal_effect_temp = probit_model2.params['temp6t4'] * stats.norm.pdf(linear_index)

# Average marginal effect
average_marginal_effect_temp = np.mean(marginal_effect_temp)

print(f"Average Marginal Effect: {average_marginal_effect_temp}")

In [None]:
#Specification 3 - Additional Fixed Effects - Not included in the report
#A constant is added explicitly: statsmodels does not include an intercept and
#the dummy sets in X_all each omit one reference category
model_3 = Probit(Y, sm.add_constant(X_all.astype(float)))
probit_model3 = model_3.fit()
print(probit_model3.summary())

In [None]:
#Specification - Future Steps - Per region
#Change the region column used in the filter below ('midwest') for each region of interest

df_filtered = df_final[df_final['midwest'] == 1]

Y_location = np.array(df_filtered['res'])

#Remove the location fixed effects as the filtered dataset only includes one region
#The list gets its own name so that `selectedvariables` (specification 1) is not
#overwritten and the cells above can be re-run in any order.
location_dummies = ('northeast', 'midwest', 'south')
selectedvariables_region = [variable for variable in selectedvariables
                            if variable not in location_dummies]

X_location = df_filtered[selectedvariables_region]

#Run specification 1 with filtered dataset
#A constant is added explicitly because statsmodels does not include an intercept
model_4 = Probit(Y_location, sm.add_constant(X_location.astype(float)))
probit_model4 = model_4.fit()
print(probit_model4.summary())

In [None]:
# Filter the DataFrame per months (winter vs non. winter months)

# Month of each case. 'date' is already a datetime column (its .dt accessor is
# used when the year and month variables are built), so it is read directly
# instead of being re-assigned on df_final, which avoids SettingWithCopyWarning.
case_month = df_final['date'].dt.month

#Filter the dataset to include only months we want (winter vs non winter months)
df_filter_nowinter = df_final[case_month.isin([3, 4, 5, 6, 7, 8, 9, 10, 11])]
df_filter_winter = df_final[case_month.isin([1, 2, 12])]

# Variables shared by both seasonal specifications
seasonal_base = ['skycover', 'co', 'co_distance', 'ozone', 'ozone_distance', 'pm',
                 'pm_distance', 'temp6t4', 'deviation', 'press6t4', 'dew6t4', 'prcp6t4', 'wind6t4',
                 'rh6t4', 'heat', 'chair', 'dummy_asylum', 'dummy_gender',
                 'middleast', 'america', 'africa', 'europe', 'northeast', 'midwest',
                 'south', 'year2000', 'year2001', 'year2002',
                 'year2003', 'middleast_dev', 'america_dev', 'africa_dev', 'europe_dev']

# Define selected variables for no winter, drop winter months dummies
# (month_11 is the omitted reference month)
selectedvariables_nowinter = seasonal_base + ['month_3', 'month_4', 'month_5', 'month_6',
                                              'month_7', 'month_8', 'month_9', 'month_10']

# Define selected variables for winter, drop non-winter months dummies
# (month_12 is the omitted reference month)
selectedvariables_winter = seasonal_base + ['month_1', 'month_2']


#Create X and Y variables
Y_nowinter = np.array(df_filter_nowinter['res'])
X_nowinter = df_filter_nowinter[selectedvariables_nowinter]

Y_winter = np.array(df_filter_winter['res'])
X_winter = df_filter_winter[selectedvariables_winter]


In [None]:
# Model with dataframe filtered with no winter months - Future Steps

# A constant is added explicitly: statsmodels does not include an intercept and
# the dummy sets in X_nowinter each omit one reference category
model_5 = Probit(Y_nowinter, sm.add_constant(X_nowinter.astype(float)))
probit_model5 = model_5.fit()
print(probit_model5.summary())

In [None]:
# Model with dataframe filtered with only winter months - Future Steps

# A constant is added explicitly: statsmodels does not include an intercept and
# the dummy sets in X_winter each omit one reference category
model_6 = Probit(Y_winter, sm.add_constant(X_winter.astype(float)))
probit_model6 = model_6.fit()
print(probit_model6.summary())

The most interesting finding is that temperature has a positive coefficient in our probit model (the opposite sign to the original paper) and a coefficient of almost zero in the Ridge model. Meanwhile, in the Lasso model temperature has a negative coefficient, in line with the original paper. Perhaps the effect of temperature is diminished when more variables are included.

## Running Ridge 

In [None]:
#Run a Ridge Model
#Code in this section based on the notebook: ISLP-Ch06_varselect_lab.ipynb

coefficients = []
#Grid of 100 penalties, 10^8 down to 10^-2, each divided by the standard deviation of Y
lambdas = 10**np.linspace(8, -2, 100) / Y.std()
scaler = StandardScaler(with_mean=True,  with_std=True)
for lam in lambdas:
    #L2-penalised logistic regression fitted by stochastic gradient descent.
    #random_state is fixed so the coefficient path is reproducible, in line
    #with the seeded tree models further down.
    ridge = SGDClassifier(loss='log_loss', penalty='l2', alpha=lam, random_state=0)
    # Create a pipeline with scaling and the classifier
    pipe = Pipeline(steps=[('scaler', scaler), ('ridge', ridge)])
    # Fit the pipeline to the data
    pipe.fit(X, Y)
    # Store the coefficients 
    coefficients.append(pipe.named_steps['ridge'].coef_.flatten())

#solution containing all our coefficients: one row per lambda, one column per feature
soln_array = np.array(coefficients)

# Create a DataFrame with the solution path (features in columns, -log(lambda) as index)
soln_path = pd.DataFrame(soln_array, columns=X.columns, index=-np.log(lambdas))
# Name the index to indicate it's the negative log of lambda
soln_path.index.name = 'negative log(lambda)'

In [None]:
#Plot the graph
#One line per feature: standardized coefficient against -log(lambda)
path_fig, ax = subplots(figsize=(8,8))
soln_path.plot(ax=ax, legend=False)
#Raw string: '\l' is not a valid escape sequence and triggers a SyntaxWarning
#in a normal string on recent Python versions; the label text is unchanged
ax.set_xlabel(r'$-\log(\lambda)$', fontsize=20)
ax.set_ylabel('Standardized coefficients', fontsize=20)
ax.legend(loc='upper left');

In [None]:
# Initialize K-Fold cross-validation strategy
# NOTE(review): the folds are not shuffled, so they follow the row order of X - confirm this is intended
kfold = KFold(n_splits=5)

# Candidate penalties for the 'ridge' step of the pipeline
param_grid = {'ridge__alpha': lambdas}

# Use GridSearchCV to find the best alpha using cross-validation
#Since we have a categorical variable the scoring is accuracy (where it defines how accurate we predict y)
# `pipe` is the last pipeline built in the ridge path loop
grid_search = GridSearchCV(pipe, param_grid, cv=kfold, scoring='accuracy', return_train_score=True)

# Fit the model using cross-validation to find the best alpha
grid_search.fit(X, Y)

# Classifier of the best pipeline, i.e. the one fitted with the selected lambda
tuned_ridge = grid_search.best_estimator_.named_steps['ridge']

# Retrieve the mean and standard deviation of cross-validation scores
mean_scores = grid_search.cv_results_['mean_test_score']
std_scores = grid_search.cv_results_['std_test_score']

In [None]:
# Plotting the Cross-Validated Accuracy and Error Bars
ridgeCV_fig, ax = subplots(figsize=(8, 8))
# Error bars: standard deviation of the fold scores divided by sqrt(number of folds)
ax.errorbar(-np.log(lambdas), mean_scores, yerr=std_scores / np.sqrt(kfold.get_n_splits()), fmt='o')
# Dashed vertical line at the selected lambda
ax.axvline(-np.log(grid_search.best_params_['ridge__alpha']), c='k', ls='--')
# Raw string: '\l' is not a valid escape sequence in a normal string; the label text is unchanged
ax.set_xlabel(r'$-\log(\lambda)$', fontsize=20)
ax.set_ylabel('Cross-validated Accuracy', fontsize=20)
ax.set_title('Cross-Validation Accuracy with Error Bars')

In [None]:
# Feature names and the coefficients of the tuned ridge classifier
variable_names = X.columns  # Get the names of the features
coefficients = tuned_ridge.coef_.flatten()

# Pair every variable name with its coefficient
coef_mapping = dict(zip(variable_names, coefficients))

# Two-column table (Variable, Coefficient) for easier reading
coef_df = pd.DataFrame(list(coef_mapping.items()), columns=['Variable', 'Coefficient'])
print(coef_df)

## Running Lasso

In [None]:
#Code in this section based on the notebook: ISLP-Ch06_varselect_lab.ipynb

#Running Lasso
#Uses the same `lambdas` grid and `scaler` as the ridge section
coefficients_l = []
for lam in lambdas:
    #L1-penalised logistic regression fitted by stochastic gradient descent.
    #random_state is fixed so the coefficient path is reproducible, in line
    #with the seeded tree models further down.
    lasso = SGDClassifier(loss='log_loss', penalty='l1', alpha=lam, random_state=0)
    # Create a pipeline with scaling and the classifier
    pipe_l = Pipeline(steps=[('scaler', scaler), ('lasso', lasso)])
    # Fit the pipeline to the data
    pipe_l.fit(X, Y)
    # Store the coefficients 
    coefficients_l.append(pipe_l.named_steps['lasso'].coef_.flatten())

# One row per lambda, one column per feature
soln_array_l = np.array(coefficients_l)
# Create a DataFrame with the solution path (features in columns, -log(lambda) as index)
soln_path_l = pd.DataFrame(soln_array_l, columns=X.columns, index=-np.log(lambdas))
# Name the index to indicate it's the negative log of lambda
soln_path_l.index.name = 'negative log(lambda)'

In [None]:
#One line per feature: standardized lasso coefficient against -log(lambda)
path_fig_l, ax = subplots(figsize=(8,8))
soln_path_l.plot(ax=ax, legend=False)
#Raw string: '\l' is not a valid escape sequence and triggers a SyntaxWarning
#in a normal string on recent Python versions; the label text is unchanged
ax.set_xlabel(r'$-\log(\lambda)$', fontsize=20)
ax.set_ylabel('Standardized coefficients', fontsize=20)
ax.legend(loc='upper left');

In [None]:
# 5-fold cross-validation
# NOTE(review): the folds are not shuffled, so they follow the row order of X - confirm this is intended
kfold = KFold(n_splits=5)

# Define a grid of alpha values to search over
param_grid_l = {'lasso__alpha': lambdas}

# Use GridSearchCV to find the best alpha using cross-validation
# Since we have a categorical variable the scoring is not MSE but accuracy
# `pipe_l` is the last pipeline built in the lasso path loop
grid_search_l = GridSearchCV(pipe_l, param_grid_l, cv=kfold, scoring='accuracy', return_train_score=True)

# Fit the model using cross-validation to find the best alpha
grid_search_l.fit(X, Y)

# Get the tuned LASSO model (SGDClassifier)
tuned_lasso = grid_search_l.best_estimator_.named_steps['lasso']

# Retrieve the mean and standard deviation of cross-validation scores
mean_scores_l = grid_search_l.cv_results_['mean_test_score']
std_scores_l = grid_search_l.cv_results_['std_test_score']

In [None]:
# Plotting the Cross-Validated Accuracy and Error Bars
lassoCV_fig, ax = subplots(figsize=(8, 8))
# Error bars: standard deviation of the fold scores divided by sqrt(number of folds)
ax.errorbar(-np.log(lambdas), mean_scores_l, yerr=std_scores_l / np.sqrt(kfold.get_n_splits()), fmt='o')
# Dashed vertical line at the selected lambda
ax.axvline(-np.log(grid_search_l.best_params_['lasso__alpha']), c='k', ls='--')
# Raw string: '\l' is not a valid escape sequence in a normal string; the label text is unchanged
ax.set_xlabel(r'$-\log(\lambda)$', fontsize=20)
ax.set_ylabel('Cross-validated Accuracy', fontsize=20)
ax.set_title('Cross-Validation Accuracy with Error Bars')

In [None]:
# Feature names and the coefficients of the tuned lasso classifier
variable_names = X.columns  # Get the names of the features
coefficients_l = tuned_lasso.coef_.flatten()

# Pair every variable name with its coefficient
coef_mapping_l = dict(zip(variable_names, coefficients_l))

# Two-column table (Variable, Coefficient); this reuses the name coef_df from the ridge section
coef_df = pd.DataFrame(list(coef_mapping_l.items()), columns=['Variable', 'Coefficient'])
print(coef_df)

## Classification Trees

In [None]:
#Code in this section based on the notebook: ISLP-TreeModels.ipynb

#Classification tree with entropy as split criterion, limited to depth 3;
#random_state fixes the tie-breaking so the fitted tree is reproducible
clf = DTC(criterion='entropy', 
          max_depth = 3,
          random_state=0)   

#Fitted on the full sample (no hold-out at this stage)
clf.fit(X, Y)

In [None]:
#Identify columns
#Displays the feature names in the order the tree models see them
X.columns

In [None]:
#Rename columns for better labelling of trees
#Readable labels, listed in the same order as the columns of X
X_detailed = [
    # temperature, weather and pollution
    'Average temperature', 'Heat index', 'Sky coverage',
    'Carbon monoxide levels', 'Distance CO source',
    'Ozone levels', 'Distance ozone source',
    'PM levels', 'Distance PM source',
    'Atmospheric pressure', 'Dew point temperature', 'Precipitation',
    'Wind speed', 'Relative humidity',
    # case characteristics
    'Judge identifier', 'Asylum application', 'Gender',
    # nationality region
    'Middle Eastern', 'American', 'African', 'European',
    # court region
    'Northeast', 'Midwest', 'South',
    # year
    '2000', '2001', '2002', '2003',
    # temperature-by-nationality interactions
    'Interaction of temperature and Middle Eastern',
    'Interaction of temperature and American',
    'Interaction of temperature and African',
    'Interaction of temperature and European',
    # month
    'January', 'February', 'March', 'April', 'May', 'June',
    'July', 'August', 'September', 'October', 'November',
]

In [None]:
#Labels used for the tree plot and the feature-importance tables
feature_names = X_detailed

In [None]:
#Accuracy on the same data the tree was fitted on (training accuracy)
accuracy_score(Y, clf.predict(X)) 

In [None]:
#Draw the fitted tree with the readable feature labels
tree_fig, ax = subplots(figsize=(12,12))
plot_tree(clf, feature_names=feature_names, ax=ax);

In [None]:
#Cross validation 
#Single random split; an integer test_size is an absolute number of rows,
#so only 200 observations are held out for testing
#NOTE(review): 200 rows is a very small test set if the sample is large - confirm
validation = skm.ShuffleSplit(n_splits=1,
                              test_size=200,
                              random_state=0)
results = skm.cross_validate(clf,
                             X,
                             Y,
                             cv=validation)
#Test score of the single split
results['test_score']

In [None]:
#Split dataset
#Half of the rows for training, half for testing; the seed fixes the partition
X_train, X_test, Y_train, Y_test = skm.train_test_split(
    X, Y, test_size=0.5, random_state=0
)

In [None]:
#Tree without a depth limit, fitted on the training half
clf = DTC(criterion='entropy', random_state=0)
clf.fit(X_train, Y_train)
#Accuracy on the held-out half
accuracy_score(Y_test, clf.predict(X_test))

In [None]:
#Cost-complexity pruning path of the tree on the training half
ccp_path = clf.cost_complexity_pruning_path(X_train, Y_train)
#Shuffled 5-fold splitter with a fixed seed
#NOTE(review): neither ccp_path nor this kfold is used by a later cell in this notebook - confirm
kfold = skm.KFold(5,
                  random_state=1,
                  shuffle=True)

## Bagging 

In [None]:
#Code in this section based on the notebook: ISLP-TreeModels.ipynb

#Bagging: a random forest that considers every feature at each split
#(max_features equals the number of columns)
bag_temperature = RFC(max_features=X_train.shape[1], random_state=0)
bag_temperature.fit(X_train, Y_train)

In [None]:
#Bagging again with 500 trees; this replaces the model fitted in the previous cell
bag_temperature = RFC(max_features=X_train.shape[1],
                n_estimators=500,#how many trees you are running
                random_state=0).fit(X_train, Y_train)
#Predictions and accuracy on the held-out half
y_hat_bag = bag_temperature.predict(X_test)
accuracy_bagging = accuracy_score(Y_test, y_hat_bag)
accuracy_bagging

In [None]:
#Feature importances of the bagged model, labelled with the readable names
feature_imp_bag = pd.DataFrame(
    {'importance':bag_temperature.feature_importances_},
    index=feature_names)
#Displayed from most to least important (the stored frame itself is not reordered)
feature_imp_bag.sort_values(by='importance', ascending=False)

In [None]:
# Feature importances of the bagged model, sorted from most to least important
feature_imp_bag = (
    pd.DataFrame({'importance': bag_temperature.feature_importances_},
                 index=feature_names)
    .sort_values(by='importance', ascending=False)
)

# Horizontal bar chart of the importances
plt.figure(figsize=(10, 6))
plt.barh(feature_imp_bag.index, feature_imp_bag['importance'], color='skyblue')
plt.xlabel('Importance')
plt.title('Feature Importances (Bagged Model)')
# Flip the y-axis so the most important feature is at the top
plt.gca().invert_yaxis()
plt.show()

## Random Forests 

In [None]:
#Code in this section based on the notebook: ISLP-TreeModels.ipynb

#Random forest: only 6 randomly chosen features are considered at each split
RF_temperature = RFC(max_features=6,
               random_state=0).fit(X_train, Y_train)
#Predictions and accuracy on the held-out half
y_hat_RF = RF_temperature.predict(X_test)
accuracy_RF = accuracy_score(Y_test, y_hat_RF)
accuracy_RF

In [None]:
#Feature importances of the random forest, labelled with the readable names
feature_imp = pd.DataFrame(
    {'importance':RF_temperature.feature_importances_},
    index=feature_names)
#Displayed from most to least important
feature_imp.sort_values(by='importance', ascending=False)

## Boosting

In [None]:
#Code in this section based on the notebook: ISLP-TreeModels.ipynb

#Gradient boosting: 500 trees of depth 3 with learning rate 0.001
boost_temperature = GBC(n_estimators=500,
                   learning_rate=0.001,
                    max_depth = 3,
                   random_state=0)
boost_temperature.fit(X_train, Y_train)

In [None]:
#Same boosting model as in the previous cell, fitted again from scratch
boost_temperature = GBC(n_estimators=500,
                   learning_rate=0.001,
                    max_depth = 3,
                   random_state=0)
boost_temperature.fit(X_train,Y_train)
#Predictions and accuracy on the held-out half
y_hat_boost = boost_temperature.predict(X_test);
accuracy_boosting = accuracy_score(Y_test, y_hat_boost)
accuracy_boosting

In [None]:
#Feature importances of the boosted model; this reuses the name feature_imp from the random forest section
feature_imp = pd.DataFrame(
    {'importance':boost_temperature.feature_importances_},
    index=feature_names)
#Displayed from most to least important
feature_imp.sort_values(by='importance', ascending=False)

## Directed Acyclic Graph (DAG) and Causal Relationship 

In [None]:
#Code in this section based on the notebook: CIDP-Chapter_04

# Define the graph
# First candidate DAG, written in GML. Edges:
#   Weather -> Cognitive, Cognitive -> Judge, Judge -> Resolution,
#   Location -> Resolution, Location -> Weather, Nationality -> Resolution
sample_gml = """graph [
directed 1

node [
    id 1
    label "Cognitive"
    ]
    
node [
    id 2
    label "Weather"
    ]

node [
    id 4
    label "Resolution"
    ]
    
node [
    id 5
    label "Location"
    ]

node [
    id 6
    label "Judge"
    ]

node [
    id 7
    label "Nationality"
    ]
    
edge [
    source 2
    target 1
    ]

edge [
    source 5
    target 4
    ]

edge [
    source 6
    target 4
    ]

edge [
    source 1
    target 6
    ]
edge [
    source 7
    target 4
    ]
edge [
    source 5
    target 2
    ]
]
    
    """

In [None]:
# Parse the GML text into a networkx graph
graph = nx.parse_gml(sample_gml)

# Drawing options shared by the DAG figures
draw_options = dict(
    with_labels=True,
    node_size=2500,
    node_color=COLORS[0],
    font_color='black',
    font_size=8,
)

# Plot
nx.draw(graph, **draw_options)

In [None]:
# Define the graph
# Second candidate DAG. Same as the first one except that Nationality now
# points to Cognitive instead of Resolution. Edges:
#   Weather -> Cognitive, Cognitive -> Judge, Judge -> Resolution,
#   Location -> Resolution, Location -> Weather, Nationality -> Cognitive
sample_gml2 = """graph [
directed 1

node [
    id 1
    label "Cognitive"
    ]
    
node [
    id 2
    label "Weather"
    ]

node [
    id 4
    label "Resolution"
    ]
    
node [
    id 5
    label "Location"
    ]

node [
    id 6
    label "Judge"
    ]

node [
    id 7
    label "Nationality"
    ]
    
edge [
    source 2
    target 1
    ]

edge [
    source 5
    target 4
    ]

edge [
    source 6
    target 4
    ]

edge [
    source 1
    target 6
    ]
edge [
    source 7
    target 1
    ]
edge [
    source 5
    target 2
    ]
]
    
    """

In [None]:
# Parse the GML text into a networkx graph
graph = nx.parse_gml(sample_gml2)

# Drawing options shared by the DAG figures
draw_options = dict(
    with_labels=True,
    node_size=2500,
    node_color=COLORS[0],
    font_color='black',
    font_size=8,
)

# Plot
nx.draw(graph, **draw_options)

In [None]:
# Define the graph
# Third candidate DAG. Location no longer points to Resolution directly but
# to Judge. Edges:
#   Weather -> Cognitive, Cognitive -> Judge, Judge -> Resolution,
#   Nationality -> Cognitive, Location -> Weather, Location -> Judge
sample_gml3 = """graph [
directed 1

node [
    id 1
    label "Cognitive"
    ]
    
node [
    id 2
    label "Weather"
    ]

node [
    id 4
    label "Resolution"
    ]
    
node [
    id 5
    label "Location"
    ]

node [
    id 6
    label "Judge"
    ]

node [
    id 7
    label "Nationality"
    ]
    
edge [
    source 2
    target 1
    ]

edge [
    source 6
    target 4
    ]

edge [
    source 1
    target 6
    ]
edge [
    source 7
    target 1
    ]
edge [
    source 5
    target 2
    ]
edge [
    source 5
    target 6
    ]
]
    
    """

In [None]:
# Parse the GML text into a networkx graph
graph = nx.parse_gml(sample_gml3)

# Drawing options shared by the DAG figures
draw_options = dict(
    with_labels=True,
    node_size=2500,
    node_color=COLORS[0],
    font_color='black',
    font_size=8,
)

# Plot
nx.draw(graph, **draw_options)

In [None]:
# Define the graph
# Final DAG, passed to the causal model below. Apart from "cognitive", the node
# labels are names used as columns elsewhere in this notebook. Edges:
#   deviation -> cognitive, cognitive -> chair, chair -> res,
#   america -> cognitive,
#   northeast/midwest/south/west -> deviation,
#   northeast/midwest/south/west -> cognitive
# NOTE(review): no 'cognitive' column is created in this notebook, so that node
# is presumably unobserved - confirm
gml_final = """graph [
directed 1
    
node [
    id 1
    label "midwest"
    ]

node [
    id 2
    label "deviation"
    ]

node [
    id 4
    label "res"
    ]
    
node [
    id 5
    label "northeast"
    ]

node [
    id 6
    label "chair"
    ]

node [
    id 7
    label "america"
    ]
node [
    id 8
    label "south"
    ]
node [
    id 9
    label "west"
    ]
node [
    id 11
    label "cognitive"
    ]
    
edge [
    source 2
    target 11
    ]

edge [
    source 6
    target 4
    ]

edge [
    source 7
    target 11
    ]
edge [
    source 5
    target 2
    ]
edge [
    source 1
    target 2
    ]
edge [
    source 8
    target 2
    ]
edge [
    source 9
    target 2
    ]
edge [
    source 5
    target 11
    ]
edge [
    source 1
    target 11
    ]
edge [
    source 8
    target 11
    ]
edge [
    source 9
    target 11
    ]
edge [ 
    source 11
    target 6
    ]
]

    
    """

In [None]:
# Parse the GML text into a networkx graph
graph = nx.parse_gml(gml_final)

# Drawing options shared by the DAG figures
draw_options = dict(
    with_labels=True,
    node_size=2500,
    node_color=COLORS[0],
    font_color='black',
    font_size=8,
)

# Plot
nx.draw(graph, **draw_options)

## Test DAG

In [None]:
#Code in this section based on the notebook:  CIDP-Chapter_07

#Model the problem
#Treatment is the 'deviation' column, outcome is 'res'; the causal structure
#is the final DAG defined above
model = CausalModel(
data=df_final,
treatment=['deviation'],
outcome="res",
graph=gml_final)

In [None]:
#Identify the estimand
#Derives, from the graph, the expression that identifies the effect of the treatment on the outcome
estimand = model.identify_effect()

In [None]:
#Show the identified estimand
print(estimand)

In [None]:
#obtain estimates
#Backdoor adjustment estimated with a linear regression
#NOTE(review): the outcome 'res' is modelled with probit/classifiers elsewhere, so this is presumably a linear probability model - confirm
estimate = model.estimate_effect(
identified_estimand=estimand,
method_name="backdoor.linear_regression")

In [None]:
#Show the estimated effect
print(estimate)

In [None]:
#perform refutation test
#Refutation test on whether estimate is influenced by unobserved confounders = random_common_cause 
#The random-common-cause refuter adds a randomly generated common cause and
#re-estimates the effect. `subset_fraction` is an option of the data-subset
#refuter and has no effect here, so it is no longer passed.
#The result keeps the name `refute_subset` because the next cell prints it.
refute_subset = model.refute_estimate(
estimand=estimand,
estimate=estimate,
method_name="random_common_cause")

In [None]:
#Show the result of the refutation test
print(refute_subset)
#A high p-value suggests that the random common cause does not have a meaningful impact on the relationship between
#temperature and the outcome, providing confidence in the stability of the findings.