In [1]:
import pandas as pd
file_path = 'sample_data.csv'
data = pd.read_csv(file_path)

# Display the first few rows of the dataset to understand its structure
data.head()

Unnamed: 0,ResponseID,CrossingSignal,NumberOfCharacters,Saved,LeftHand,Man,Woman,Pregnant,Stroller,OldMan,...,LargeMan,Criminal,MaleExecutive,FemaleExecutive,FemaleAthlete,MaleAthlete,FemaleDoctor,MaleDoctor,Dog,Cat
0,FuBqcfGjNSZTGLmJL,2,1.0,0,0.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
1,FuBqcfGjNSZTGLmJL,0,1.0,1,1.0,1.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0,0.0
2,FuBqbkH5834oodYJA,0,5.0,1,0.0,0.0,2.0,0.0,1.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,1.0,0.0,0.0
3,FuBqbkH5834oodYJA,0,1.0,0,1.0,0.0,0.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0
4,FuBq4j5Kw9tYatn35,0,2.0,1,1.0,0.0,1.0,0.0,0.0,0.0,...,0.0,0.0,0.0,0.0,1.0,0.0,0.0,0.0,0.0,0.0


In [2]:
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import accuracy_score, classification_report
from sklearn.preprocessing import StandardScaler

X = data.drop(columns=['Saved', 'ResponseID', 'CrossingSignal', 'LeftHand'])
y = data['Saved']

#standardizing the features
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)

#splitting the data into training and test sets
X_train, X_test, y_train, y_test = train_test_split(X_scaled, y, test_size=0.3, random_state=42)

model = LogisticRegression()
model.fit(X_train, y_train)

#predictions on the test set
y_pred = model.predict(X_test)

accuracy = accuracy_score(y_test, y_pred)
report = classification_report(y_test, y_pred)

accuracy, report

(0.6099113813743744,
 '              precision    recall  f1-score   support\n\n           0       0.61      0.63      0.62    104934\n           1       0.61      0.59      0.60    105067\n\n    accuracy                           0.61    210001\n   macro avg       0.61      0.61      0.61    210001\nweighted avg       0.61      0.61      0.61    210001\n')

In [3]:
#model equation
intercept = model.intercept_[0]
coefficients = model.coef_[0]
feature_names = X.columns
coefficients_dict = {feature: coef for feature, coef in zip(feature_names, coefficients)}

intercept, coefficients_dict

(-0.00011980961298883392,
 {'NumberOfCharacters': 0.12200854830199825,
  'Man': 0.044967939314875416,
  'Woman': 0.0812283840291631,
  'Pregnant': 0.11848633466630681,
  'Stroller': 0.12360055940410612,
  'OldMan': -0.1406951525385103,
  'OldWoman': -0.10548868262530094,
  'Boy': 0.1686447164168669,
  'Girl': 0.21158913086943462,
  'Homeless': 0.04845537312404233,
  'LargeWoman': 0.011656024059008343,
  'LargeMan': -0.0418276476562779,
  'Criminal': 0.03796677254431176,
  'MaleExecutive': 0.03040380254482354,
  'FemaleExecutive': 0.08662294685667743,
  'FemaleAthlete': 0.10226548636110902,
  'MaleAthlete': 0.060876660618009885,
  'FemaleDoctor': 0.08621033870731074,
  'MaleDoctor': 0.05069157459304692,
  'Dog': -0.14955948859090462,
  'Cat': -0.17057396480811887})

In [None]:
import pandas as pd
data_even = data.iloc[:-1]
diff_data_even = pd.DataFrame()
for i in range(0, len(data_even), 2):
    scenario_1 = data_even.iloc[i]
    scenario_2 = data_even.iloc[i + 1]

    #the difference between the scenarios in each pair
    diff = scenario_2[X.columns] - scenario_1[X.columns]
    diff['Saved'] = scenario_2['Saved']

    diff_data_even = pd.concat([diff_data_even, diff], ignore_index=True)

X_diff_even = diff_data_even.drop(columns=['Saved'])
y_diff_even = diff_data_even['Saved']

X_diff_even_scaled = scaler.fit_transform(X_diff_even)
X_diff_even_train, X_diff_even_test, y_diff_even_train, y_diff_even_test = train_test_split(X_diff_even_scaled, y_diff_even, test_size=0.3, random_state=42)

model_diff_even = LogisticRegression()
model_diff_even.fit(X_diff_even_train, y_diff_even_train)

#predicting on the transformed test set
y_diff_even_pred = model_diff_even.predict(X_diff_even_test)

accuracy_diff_even = accuracy_score(y_diff_even_test, y_diff_even_pred)
report_diff_even = classification_report(y_diff_even_test, y_diff_even_pred)

accuracy_diff_even, report_diff_even


