In [51]:
import pandas as pd
from sklearn.preprocessing import LabelEncoder
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score

In [52]:
data = pd.read_csv('crush_dead.csv')
df = pd.DataFrame(data=data)

df.head()

Unnamed: 0,dvcat,weight,dead,airbag,seatbelt,frontal,sex,ageOFocc,yearacc,yearVeh,abcat,occRole,deploy,caseid
0,40-54,18.895,alive,airbag,belted,0,m,25,2002,2000.0,deploy,pass,1,81:99:1
1,25-39,266.532,alive,none,none,1,f,28,2001,1991.0,unavail,driver,0,76:12:2
2,25-39,51.81,alive,airbag,belted,1,f,36,1999,1994.0,deploy,driver,1,78:53:2
3,10-24,1567.626,alive,airbag,belted,0,f,24,2002,1994.0,nodeploy,driver,0,11:47:1
4,25-39,31.342,alive,none,none,1,f,46,1997,1990.0,unavail,pass,0,12:87:2


In [53]:
df.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 8652 entries, 0 to 8651
Data columns (total 14 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   dvcat     8652 non-null   object 
 1   weight    8652 non-null   float64
 2   dead      8652 non-null   object 
 3   airbag    8652 non-null   object 
 4   seatbelt  8652 non-null   object 
 5   frontal   8652 non-null   int64  
 6   sex       8652 non-null   object 
 7   ageOFocc  8652 non-null   int64  
 8   yearacc   8652 non-null   int64  
 9   yearVeh   8651 non-null   float64
 10  abcat     8652 non-null   object 
 11  occRole   8652 non-null   object 
 12  deploy    8652 non-null   int64  
 13  caseid    8652 non-null   object 
dtypes: float64(2), int64(4), object(8)
memory usage: 946.4+ KB


# Means of Columns

- dvcat : age category
- weight : weight of car
- dead : dead or alive person
- airbag : is there an airbag
- seatbelt : seatbelt usage
- frontal : is the crush from the frontal
- sex : sex of person
- ageOFocc : age of person
- yearacc : year of crush
- yearVeh : year of vehicle
- abcat	: airbag status
- occRole : role of person
- deploy : airbag deploy status
- caseid : crash id

In [54]:
df = df.drop('caseid', axis=1)

In [55]:
df.head()

Unnamed: 0,dvcat,weight,dead,airbag,seatbelt,frontal,sex,ageOFocc,yearacc,yearVeh,abcat,occRole,deploy
0,40-54,18.895,alive,airbag,belted,0,m,25,2002,2000.0,deploy,pass,1
1,25-39,266.532,alive,none,none,1,f,28,2001,1991.0,unavail,driver,0
2,25-39,51.81,alive,airbag,belted,1,f,36,1999,1994.0,deploy,driver,1
3,10-24,1567.626,alive,airbag,belted,0,f,24,2002,1994.0,nodeploy,driver,0
4,25-39,31.342,alive,none,none,1,f,46,1997,1990.0,unavail,pass,0


In [56]:
df.isnull().sum()

dvcat       0
weight      0
dead        0
airbag      0
seatbelt    0
frontal     0
sex         0
ageOFocc    0
yearacc     0
yearVeh     1
abcat       0
occRole     0
deploy      0
dtype: int64

In [57]:
df = df.dropna(axis=0)

In [58]:
df.isnull().sum()

dvcat       0
weight      0
dead        0
airbag      0
seatbelt    0
frontal     0
sex         0
ageOFocc    0
yearacc     0
yearVeh     0
abcat       0
occRole     0
deploy      0
dtype: int64

In [59]:
df.describe()

Unnamed: 0,weight,frontal,ageOFocc,yearacc,yearVeh,deploy
count,8651.0,8651.0,8651.0,8651.0,8651.0,8651.0
mean,470.100312,0.643047,37.194775,1999.549879,1992.725118,0.330598
std,1626.268409,0.479128,17.837905,1.703286,5.647203,0.470456
min,0.0,0.0,16.0,1997.0,1956.0,0.0
25%,32.824,0.0,22.0,1998.0,1989.0,0.0
50%,87.576,1.0,33.0,2000.0,1994.0,0.0
75%,375.337,1.0,47.0,2001.0,1997.0,1.0
max,57871.595,1.0,97.0,2002.0,2003.0,1.0


In [60]:
label_encoder = LabelEncoder()

columns = ['dvcat', 'dead', 'airbag', 'seatbelt', 'sex', 'abcat', 'occRole']

for column in columns:
    df[column] = label_encoder.fit_transform(df[column])
    df[column] = df[column].astype(int)

In [61]:
df.info()

<class 'pandas.core.frame.DataFrame'>
Index: 8651 entries, 0 to 8651
Data columns (total 13 columns):
 #   Column    Non-Null Count  Dtype  
---  ------    --------------  -----  
 0   dvcat     8651 non-null   int64  
 1   weight    8651 non-null   float64
 2   dead      8651 non-null   int64  
 3   airbag    8651 non-null   int64  
 4   seatbelt  8651 non-null   int64  
 5   frontal   8651 non-null   int64  
 6   sex       8651 non-null   int64  
 7   ageOFocc  8651 non-null   int64  
 8   yearacc   8651 non-null   int64  
 9   yearVeh   8651 non-null   float64
 10  abcat     8651 non-null   int64  
 11  occRole   8651 non-null   int64  
 12  deploy    8651 non-null   int64  
dtypes: float64(2), int64(11)
memory usage: 946.2 KB


In [62]:
X = df.drop('dead',axis=1)
Y = df['dead']

In [63]:
x_train, x_test, y_train, y_test = train_test_split(X,Y, test_size= 0.2, random_state=42)

In [64]:
model = LogisticRegression()

model.fit(x_train, y_train)
pred = model.predict(x_test)

accuracy = accuracy_score(y_test, pred)
print(accuracy)

0.9526285384170999


STOP: TOTAL NO. of ITERATIONS REACHED LIMIT.

Increase the number of iterations (max_iter) or scale the data as shown in:
    https://scikit-learn.org/stable/modules/preprocessing.html
Please also refer to the documentation for alternative solver options:
    https://scikit-learn.org/stable/modules/linear_model.html#logistic-regression
  n_iter_i = _check_optimize_result(
