In [1]:
#pip install pyforest

The 2012 US Army Anthropometric Survey (ANSUR II) was executed by the Natick Soldier Research, Development and Engineering Center (NSRDEC) from October 2010 to April 2012 and is comprised of personnel representing the total US Army force to include the US Army Active Duty, Reserves, and National Guard. In addition to the anthropometric and demographic data described below, the ANSUR II database also consists of 3D whole body, foot, and head scans of Soldier participants. These 3D data are not publicly available out of respect for the privacy of ANSUR II participants. The data from this survey are used for a wide range of equipment design, sizing, and tariffing applications within the military and has many potential commercial, industrial, and academic applications.

The ANSUR II working databases contain 93 anthropometric measurements which were directly measured, and 15 demographic/administrative variables explained below. The ANSUR II Male working database contains a total sample of 4,082 subjects. The ANSUR II Female working database contains a total sample of 1,986 subjects.


data dict:
https://data.world/datamil/ansur-ii-data-dictionary/workspace/file?filename=ANSUR+II+Databases+Overview.pdf


Hİnt for metric : Our mission to classify soldiers races via their body sclales. We want a balanced score for our predictions.

# Import libraries

In [60]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

In [137]:
from pandas_profiling import ProfileReport
import pandas_profiling as pp #eda için çok kullanışlı ama %100 güvenilmeyebilir
from sklearn.preprocessing import StandardScaler, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV, cross_validate, cross_val_score
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn.metrics import classification_report, confusion_matrix, log_loss, recall_score, roc_auc_score
from yellowbrick.classifier import ConfusionMatrix
from yellowbrick.cluster import KElbowVisualizer

# Ingest the data from links below and make a dataframe
- Soldiers Male : https://query.data.world/s/h3pbhckz5ck4rc7qmt2wlknlnn7esr
- Soldiers Female : https://query.data.world/s/sq27zz4hawg32yfxksqwijxmpwmynq

In [38]:
df_male=pd.read_csv("ANSUR II MALE Public.csv",encoding='latin-1')
df_male.head()

Unnamed: 0,subjectid,abdominalextensiondepthsitting,acromialheight,acromionradialelength,anklecircumference,axillaheight,balloffootcircumference,balloffootlength,biacromialbreadth,bicepscircumferenceflexed,...,Branch,PrimaryMOS,SubjectsBirthLocation,SubjectNumericRace,Ethnicity,DODRace,Age,Heightin,Weightlbs,WritingPreference
0,10027,266,1467,337,222,1347,253,202,401,369,...,Combat Arms,19D,North Dakota,1,,1,41,71,180,Right hand
1,10032,233,1395,326,220,1293,245,193,394,338,...,Combat Support,68W,New York,1,,1,35,68,160,Left hand
2,10033,287,1430,341,230,1327,256,196,427,408,...,Combat Support,68W,New York,2,,2,42,68,205,Left hand
3,10092,234,1347,310,230,1239,262,199,401,359,...,Combat Service Support,88M,Wisconsin,1,,1,31,66,175,Right hand
4,10093,250,1585,372,247,1478,267,224,435,356,...,Combat Service Support,92G,North Carolina,2,,2,21,77,213,Right hand


In [39]:
df_female=pd.read_csv("ANSUR II FEMALE Public.csv",encoding='latin-1')
df_female.head()

Unnamed: 0,SubjectId,abdominalextensiondepthsitting,acromialheight,acromionradialelength,anklecircumference,axillaheight,balloffootcircumference,balloffootlength,biacromialbreadth,bicepscircumferenceflexed,...,Branch,PrimaryMOS,SubjectsBirthLocation,SubjectNumericRace,Ethnicity,DODRace,Age,Heightin,Weightlbs,WritingPreference
0,10037,231,1282,301,204,1180,222,177,373,315,...,Combat Support,92Y,Germany,2,,2,26,61,142,Right hand
1,10038,194,1379,320,207,1292,225,178,372,272,...,Combat Service Support,25U,California,3,Mexican,3,21,64,120,Right hand
2,10042,183,1369,329,233,1271,237,196,397,300,...,Combat Service Support,35D,Texas,1,,1,23,68,147,Right hand
3,10043,261,1356,306,214,1250,240,188,384,364,...,Combat Service Support,25U,District of Columbia,8,Caribbean Islander,2,22,66,175,Right hand
4,10051,309,1303,308,214,1210,217,182,378,320,...,Combat Arms,42A,Texas,1,,1,45,63,195,Right hand


In [6]:
df_male.shape

(4082, 108)

In [7]:
df_female.shape

(1986, 108)

In [21]:
df_male.columns

Index(['subjectid', 'abdominalextensiondepthsitting', 'acromialheight',
       'acromionradialelength', 'anklecircumference', 'axillaheight',
       'balloffootcircumference', 'balloffootlength', 'biacromialbreadth',
       'bicepscircumferenceflexed',
       ...
       'Branch', 'PrimaryMOS', 'SubjectsBirthLocation', 'SubjectNumericRace',
       'Ethnicity', 'DODRace', 'Age', 'Heightin', 'Weightlbs',
       'WritingPreference'],
      dtype='object', length=108)

In [22]:
df_female.columns

Index(['SubjectId', 'abdominalextensiondepthsitting', 'acromialheight',
       'acromionradialelength', 'anklecircumference', 'axillaheight',
       'balloffootcircumference', 'balloffootlength', 'biacromialbreadth',
       'bicepscircumferenceflexed',
       ...
       'Branch', 'PrimaryMOS', 'SubjectsBirthLocation', 'SubjectNumericRace',
       'Ethnicity', 'DODRace', 'Age', 'Heightin', 'Weightlbs',
       'WritingPreference'],
      dtype='object', length=108)

In [23]:
df_male.columns == df_female.columns


array([False,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True,
        True,  True,  True,  True,  True,  True,  True,  True,  True])

# EDA
Tips :
- Drop unnecessary colums
- Drop DODRace class if value count below 500 (we assume that our data model can't learn if it is below 500)
- Find unusual value in Weightlbs

Let's compare two data and start EDA

In [8]:
df_male.info(verbose=True)
print("------------------------------------------------")
df_female.info(verbose=True)

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 4082 entries, 0 to 4081
Data columns (total 108 columns):
 #   Column                          Dtype 
---  ------                          ----- 
 0   subjectid                       int64 
 1   abdominalextensiondepthsitting  int64 
 2   acromialheight                  int64 
 3   acromionradialelength           int64 
 4   anklecircumference              int64 
 5   axillaheight                    int64 
 6   balloffootcircumference         int64 
 7   balloffootlength                int64 
 8   biacromialbreadth               int64 
 9   bicepscircumferenceflexed       int64 
 10  bicristalbreadth                int64 
 11  bideltoidbreadth                int64 
 12  bimalleolarbreadth              int64 
 13  bitragionchinarc                int64 
 14  bitragionsubmandibulararc       int64 
 15  bizygomaticbreadth              int64 
 16  buttockcircumference            int64 
 17  buttockdepth                    int64 
 18  buttock

In [40]:
df=pd.concat([df_male,df_female], axis=0) # birleştirme yapıldı ancak data_dict birleştirmenin çok yanlış sonuçları olacağını brlitttiğinden vazgeçilldi.
df.head()

Unnamed: 0,subjectid,abdominalextensiondepthsitting,acromialheight,acromionradialelength,anklecircumference,axillaheight,balloffootcircumference,balloffootlength,biacromialbreadth,bicepscircumferenceflexed,...,PrimaryMOS,SubjectsBirthLocation,SubjectNumericRace,Ethnicity,DODRace,Age,Heightin,Weightlbs,WritingPreference,SubjectId
0,10027.0,266,1467,337,222,1347,253,202,401,369,...,19D,North Dakota,1,,1,41,71,180,Right hand,
1,10032.0,233,1395,326,220,1293,245,193,394,338,...,68W,New York,1,,1,35,68,160,Left hand,
2,10033.0,287,1430,341,230,1327,256,196,427,408,...,68W,New York,2,,2,42,68,205,Left hand,
3,10092.0,234,1347,310,230,1239,262,199,401,359,...,88M,Wisconsin,1,,1,31,66,175,Right hand,
4,10093.0,250,1585,372,247,1478,267,224,435,356,...,92G,North Carolina,2,,2,21,77,213,Right hand,


In [48]:
drop_columns=["subjectid","SubjectNumericRace","Ethnicity","SubjectsBirthLocation"\
              ,"WritingPreference","Date","Installation","Component","Branch","PrimaryMOS","SubjectId"]

In [42]:
df.Ethnicity.value_counts(normalize=True)

Mexican                        0.251232
Puerto Rican                   0.128783
Caribbean Islander             0.087262
Filipino                       0.047150
Cherokee                       0.045742
                                 ...   
Chippewa Mexican               0.000704
Caribbean Islander Cherokee    0.000704
Tsimshian                      0.000704
Colombian Cuban                0.000704
Cheyenne River Sioux           0.000704
Name: Ethnicity, Length: 209, dtype: float64

In [43]:
df.Gender.value_counts(normalize=True)

Male      0.672709
Female    0.327291
Name: Gender, dtype: float64

In [44]:
df.Weightlbs.value_counts()

170    234
150    233
180    232
160    215
175    211
      ... 
292      1
276      1
272      1
268      1
0        1
Name: Weightlbs, Length: 188, dtype: int64

In [26]:
df['DODRace'].value_counts()

1    3792
2    1298
3     679
4     188
6      59
5      49
8       3
Name: DODRace, dtype: int64

In [45]:
df1=df.drop(df.loc[(df['DODRace']==5) | (df['DODRace']==6)| (df['DODRace']==8)| (df['DODRace']==4)].index, axis=0)

In [46]:
df1

Unnamed: 0,subjectid,abdominalextensiondepthsitting,acromialheight,acromionradialelength,anklecircumference,axillaheight,balloffootcircumference,balloffootlength,biacromialbreadth,bicepscircumferenceflexed,...,PrimaryMOS,SubjectsBirthLocation,SubjectNumericRace,Ethnicity,DODRace,Age,Heightin,Weightlbs,WritingPreference,SubjectId
0,10027.0,266,1467,337,222,1347,253,202,401,369,...,19D,North Dakota,1,,1,41,71,180,Right hand,
1,10032.0,233,1395,326,220,1293,245,193,394,338,...,68W,New York,1,,1,35,68,160,Left hand,
2,10033.0,287,1430,341,230,1327,256,196,427,408,...,68W,New York,2,,2,42,68,205,Left hand,
3,10092.0,234,1347,310,230,1239,262,199,401,359,...,88M,Wisconsin,1,,1,31,66,175,Right hand,
4,10093.0,250,1585,372,247,1478,267,224,435,356,...,92G,North Carolina,2,,2,21,77,213,Right hand,
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1981,,285,1392,335,223,1297,252,196,368,299,...,42A,Texas,3,Mexican,3,51,67,180,Right hand,29501.0
1982,,262,1324,301,202,1204,236,181,378,294,...,25U,Texas,3,Mexican,3,40,63,150,Right hand,29502.0
1983,,260,1334,318,213,1259,234,183,363,332,...,42A,Texas,2,,2,40,66,168,Right hand,29503.0
1984,,205,1293,302,199,1207,213,163,376,276,...,92Y,Texas,3,Mexican,3,31,63,133,Right hand,29511.0


In [50]:
df2=df1.drop(drop_columns, axis=1)

In [55]:
df2.head(5)

Unnamed: 0,abdominalextensiondepthsitting,acromialheight,acromionradialelength,anklecircumference,axillaheight,balloffootcircumference,balloffootlength,biacromialbreadth,bicepscircumferenceflexed,bicristalbreadth,...,waistfrontlengthsitting,waistheightomphalion,weightkg,wristcircumference,wristheight,Gender,DODRace,Age,Heightin,Weightlbs
0,266,1467,337,222,1347,253,202,401,369,274,...,440,1054,815,175,853,Male,1,41,71,180
1,233,1395,326,220,1293,245,193,394,338,257,...,371,1054,726,167,815,Male,1,35,68,160
2,287,1430,341,230,1327,256,196,427,408,261,...,411,1041,929,180,831,Male,2,42,68,205
3,234,1347,310,230,1239,262,199,401,359,262,...,399,968,794,176,793,Male,1,31,66,175
4,250,1585,372,247,1478,267,224,435,356,263,...,379,1245,946,188,954,Male,2,21,77,213


In [58]:
df2.Weightlbs.value_counts()

170    218
150    211
180    211
175    198
160    197
      ... 
101      1
292      1
276      1
272      1
0        1
Name: Weightlbs, Length: 187, dtype: int64

In [62]:
df2.weightkg.value_counts()

791     24
842     23
695     22
882     21
797     21
        ..
1157     1
1153     1
1149     1
1141     1
400      1
Name: weightkg, Length: 770, dtype: int64

In [68]:
df2=pd.get_dummies(df2)

In [70]:
df2.shape

(5589, 99)

# DATA Preprocessing

In [71]:
from sklearn.model_selection import train_test_split

In [72]:
X=df2.drop('DODRace', axis=1)
y=df2['DODRace']

In [73]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

In [74]:
scaler = StandardScaler()

In [75]:
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

# Modelling Implementing
- You can use pipeline (optional)
- You can research over/undersampling methods and after selecting the best model, examine it to see if better scores can be obtained. (https://imbalanced-learn.org/stable/introduction.html)

## 1. Logistic Regression

# With Default Parameters

In [76]:
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report, plot_confusion_matrix

In [143]:
log_model=LogisticRegression(max_iter=10000, multi_class='ovr')
log_model.fit(X_train_scaled, y_train)
y_pred=log_model.predict(X_test_scaled)
log_f1 = f1_score(y_test, y_pred,average='micro')
log_recall = recall_score(y_test, y_pred,average='micro')
#log_auc = roc_auc_score(y_test, y_pred,multi_class='ovr')
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[1074   12   21]
 [  28  359    8]
 [ 101   21   53]]
              precision    recall  f1-score   support

           1       0.89      0.97      0.93      1107
           2       0.92      0.91      0.91       395
           3       0.65      0.30      0.41       175

    accuracy                           0.89      1677
   macro avg       0.82      0.73      0.75      1677
weighted avg       0.87      0.89      0.87      1677



# Cross Validate

In [80]:
from sklearn.model_selection import cross_validate

In [82]:
model = LogisticRegression(max_iter=1000000)

scores = cross_validate(model, X_train_scaled, y_train, scoring = ['accuracy', 'precision_weighted','recall_weighted',
                                                                   'f1_weighted'], cv = 10)
df2_scores = pd.DataFrame(scores, index = range(1, 11))
df2_scores.mean()[2:]

test_accuracy              0.871678
test_precision_weighted    0.860992
test_recall_weighted       0.871678
test_f1_weighted           0.861526
dtype: float64

In [84]:
df2_scores.T

Unnamed: 0,1,2,3,4,5,6,7,8,9,10
fit_time,1.494143,1.589852,1.280331,1.229868,1.279065,1.644857,1.505957,1.479106,1.652657,1.756929
score_time,0.007994,0.021987,0.006993,0.008987,0.006991,0.007993,0.005,0.007993,0.007993,0.02399
test_accuracy,0.862245,0.875,0.877238,0.861893,0.86445,0.892583,0.890026,0.861893,0.86445,0.867008
test_precision_weighted,0.857609,0.867929,0.866459,0.850922,0.853491,0.882314,0.879637,0.845779,0.84809,0.857689
test_recall_weighted,0.862245,0.875,0.877238,0.861893,0.86445,0.892583,0.890026,0.861893,0.86445,0.867008
test_f1_weighted,0.859645,0.868577,0.868318,0.851838,0.854325,0.882515,0.872316,0.851365,0.848421,0.857939


# With Best Parameters (GridsearchCV)

In [107]:
log_model = LogisticRegression(solver='saga',multi_class="ovr",max_iter=10000)

In [108]:
penalty = ["elasticnet"]
l1_ratio = np.linspace(0, 1, 5)
C = np.logspace(0, 10, 5)

param_grid = {"penalty" : penalty,
             "l1_ratio" : l1_ratio,
             "C" : C}

In [109]:
grid_model = GridSearchCV(log_model, param_grid = param_grid,cv=3, scoring="recall_macro")

In [110]:
grid_model.fit(X_train_scaled,y_train)

GridSearchCV(cv=3,
             estimator=LogisticRegression(max_iter=10000, multi_class='ovr',
                                          solver='saga'),
             param_grid={'C': array([1.00000000e+00, 3.16227766e+02, 1.00000000e+05, 3.16227766e+07,
       1.00000000e+10]),
                         'l1_ratio': array([0.  , 0.25, 0.5 , 0.75, 1.  ]),
                         'penalty': ['elasticnet']},
             scoring='recall_macro')

In [111]:
grid_model.best_params_

{'C': 1.0, 'l1_ratio': 0.0, 'penalty': 'elasticnet'}

In [112]:
y_pred = grid_model.predict(X_test_scaled)
y_pred

array([2, 1, 2, ..., 1, 1, 1], dtype=int64)

In [113]:
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred, zero_division=1))  # zero division düzenleme knusunda ikaz verdi.

[[1075   11   21]
 [  29  358    8]
 [ 101   21   53]]
              precision    recall  f1-score   support

           1       0.89      0.97      0.93      1107
           2       0.92      0.91      0.91       395
           3       0.65      0.30      0.41       175

    accuracy                           0.89      1677
   macro avg       0.82      0.73      0.75      1677
weighted avg       0.87      0.89      0.87      1677



## 2. Support Vector Classifier

In [145]:
svm_model = SVC()
svm_model.fit(X_train_scaled, y_train)
y_pred = svm_model.predict(X_test_scaled)
svc_f1 = f1_score(y_test, y_pred,average='micro')
svc_recall = recall_score(y_test, y_pred,average='micro')
#svc_auc = roc_auc_score(y_test, y_pred)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[1086   14    7]
 [  35  359    1]
 [ 124   21   30]]
              precision    recall  f1-score   support

           1       0.87      0.98      0.92      1107
           2       0.91      0.91      0.91       395
           3       0.79      0.17      0.28       175

    accuracy                           0.88      1677
   macro avg       0.86      0.69      0.71      1677
weighted avg       0.87      0.88      0.85      1677



# CV

In [91]:
scores = cross_validate(sv_model, X_train_scaled, y_train, cv=10, n_jobs=-1, scoring="recall_macro")

In [92]:
pd.DataFrame(scores)

Unnamed: 0,fit_time,score_time,test_score
0,3.251042,0.234866,0.683485
1,3.17014,0.198888,0.679677
2,3.115119,0.204885,0.711027
3,3.067904,0.223874,0.653825
4,3.203223,0.233862,0.684058
5,3.244195,0.225869,0.685195
6,3.306107,0.200886,0.675221
7,2.967301,0.169902,0.682958
8,2.010846,0.138921,0.676798
9,2.00085,0.13792,0.65869


In [93]:
pd.DataFrame(scores)['test_score'].mean()

0.6790934036208991

## 3. Random Forest

# Decision Tree

In [114]:
def train_val(y_train, y_train_pred, y_test, y_pred):
    
    scores = {"train_set": {"Accuracy" : accuracy_score(y_train, y_train_pred),
                            "Precision" : precision_score(y_train, y_train_pred),
                            "Recall" : recall_score(y_train, y_train_pred),                          
                            "f1" : f1_score(y_train, y_train_pred)},
    
              "test_set": {"Accuracy" : accuracy_score(y_test, y_pred),
                           "Precision" : precision_score(y_test, y_pred),
                           "Recall" : recall_score(y_test, y_pred),                          
                           "f1" : f1_score(y_test, y_pred)}}
    
    return pd.DataFrame(scores)

In [115]:
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, recall_score, precision_score, f1_score
from sklearn.model_selection import cross_val_score, cross_validate
from sklearn.model_selection import GridSearchCV

In [147]:
dt_model = DecisionTreeClassifier(random_state=101)
dt_model.fit(X_train, y_train)
y_pred = dt_model.predict(X_test)
dt_f1 = f1_score(y_test, y_pred,average='micro')
dt_recall = recall_score(y_test, y_pred,average='micro')
#dt_auc = roc_auc_score(y_test, y_pred)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[866  91 150]
 [100 249  46]
 [107  33  35]]
              precision    recall  f1-score   support

           1       0.81      0.78      0.79      1107
           2       0.67      0.63      0.65       395
           3       0.15      0.20      0.17       175

    accuracy                           0.69      1677
   macro avg       0.54      0.54      0.54      1677
weighted avg       0.71      0.69      0.70      1677



In [117]:
dt_model.feature_importances_

array([0.00577858, 0.00218575, 0.0017419 , 0.011574  , 0.00699687,
       0.0038839 , 0.00080705, 0.00261235, 0.01041771, 0.03995093,
       0.00359291, 0.00679942, 0.02882208, 0.00763604, 0.00601273,
       0.00410193, 0.        , 0.00791015, 0.01971166, 0.02500866,
       0.00204658, 0.0037922 , 0.01390358, 0.00941522, 0.00386917,
       0.00475096, 0.00341003, 0.00620626, 0.00668077, 0.00563253,
       0.01117102, 0.02144223, 0.0999391 , 0.01494662, 0.00624909,
       0.00365038, 0.01162131, 0.00685943, 0.00180293, 0.02256423,
       0.00467936, 0.00426901, 0.00998464, 0.02266294, 0.00231712,
       0.01417786, 0.00531926, 0.00075661, 0.05953305, 0.00453728,
       0.00389068, 0.00483073, 0.064618  , 0.00318504, 0.01119687,
       0.00379564, 0.00084067, 0.00143847, 0.00544866, 0.00332596,
       0.00430798, 0.00209845, 0.00838059, 0.00828322, 0.0053825 ,
       0.00690897, 0.00820321, 0.00080705, 0.01143613, 0.0138706 ,
       0.07715162, 0.00961656, 0.00225815, 0.00488408, 0.00156

In [118]:
dt_feature_imp = pd.DataFrame(index = X.columns, data = dt_model.feature_importances_,
                              columns = ["Feature Importance"]).sort_values("Feature Importance", ascending = False)
dt_feature_imp

Unnamed: 0,Feature Importance
elbowrestheight,0.099939
sittingheight,0.077152
interpupillarybreadth,0.064618
heelbreadth,0.059533
bicristalbreadth,0.039951
...,...
shouldercircumference,0.000807
balloffootlength,0.000807
heelanklecircumference,0.000757
buttockdepth,0.000000


# Random Forest

In [95]:
from sklearn.ensemble import RandomForestClassifier

In [148]:
rf_model = RandomForestClassifier(random_state=42)
rf_model.fit(X_train,y_train)
y_pred = rf_model.predict(X_test)
rf_f1 = f1_score(y_test, y_pred,average='micro')
rf_recall = recall_score(y_test, y_pred,average='micro')
#rf_auc = roc_auc_score(y_test, y_pred)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[1077   22    8]
 [ 101  293    1]
 [ 145   14   16]]
              precision    recall  f1-score   support

           1       0.81      0.97      0.89      1107
           2       0.89      0.74      0.81       395
           3       0.64      0.09      0.16       175

    accuracy                           0.83      1677
   macro avg       0.78      0.60      0.62      1677
weighted avg       0.81      0.83      0.79      1677



# RF Model Feature Importance

In [169]:
rf_model.feature_importances_

array([0.00572955, 0.00821675, 0.00603517, 0.00862839, 0.00711669,
       0.0065387 , 0.00685627, 0.00676232, 0.00763128, 0.03237557,
       0.00549381, 0.00572629, 0.02217579, 0.00844468, 0.01125011,
       0.00733712, 0.0066534 , 0.00902841, 0.01558175, 0.01440053,
       0.00676739, 0.00819287, 0.00959445, 0.00731729, 0.00654319,
       0.00667366, 0.00850166, 0.00753565, 0.00777437, 0.00507701,
       0.01182332, 0.00890058, 0.03377648, 0.03870817, 0.00649993,
       0.0086129 , 0.01692968, 0.00677335, 0.00597446, 0.02018823,
       0.00641091, 0.00597018, 0.00757613, 0.01656817, 0.00671211,
       0.01516115, 0.00896093, 0.00692025, 0.03961803, 0.00745355,
       0.00652734, 0.00648272, 0.02997067, 0.00663717, 0.0061769 ,
       0.00657335, 0.005911  , 0.0073403 , 0.01409769, 0.00582689,
       0.00684645, 0.00725336, 0.0078307 , 0.00783738, 0.01328657,
       0.00499913, 0.01996338, 0.00687289, 0.00699968, 0.00616383,
       0.04101948, 0.0062058 , 0.00635389, 0.01150909, 0.01045

In [170]:
rf_feature_imp = pd.DataFrame(index = X.columns, data = rf_model.feature_importances_,
                              columns = ["Feature Importance"]).sort_values("Feature Importance", ascending = False)
rf_feature_imp

Unnamed: 0,Feature Importance
sittingheight,0.041019
heelbreadth,0.039618
eyeheightsitting,0.038708
elbowrestheight,0.033776
bicristalbreadth,0.032376
...,...
weightkg,0.005066
poplitealheight,0.004999
Gender_Male,0.004940
Weightlbs,0.004858


## 4. XGBoost

In [97]:
from xgboost import XGBClassifier

In [98]:
xgb = XGBClassifier(random_state=42).fit(X_train, y_train)





In [99]:
y_pred = xgb.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test,y_pred))

[[1062   21   24]
 [  48  339    8]
 [ 113   17   45]]
              precision    recall  f1-score   support

           1       0.87      0.96      0.91      1107
           2       0.90      0.86      0.88       395
           3       0.58      0.26      0.36       175

    accuracy                           0.86      1677
   macro avg       0.78      0.69      0.72      1677
weighted avg       0.85      0.86      0.85      1677



In [106]:
y_train_pred = xgb.predict(X_train)
print(confusion_matrix(y_train, y_train_pred))
print(classification_report(y_train, y_train_pred))

[[2562    0    0]
 [   0  866    0]
 [   0    0  484]]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00      2562
           2       1.00      1.00      1.00       866
           3       1.00      1.00      1.00       484

    accuracy                           1.00      3912
   macro avg       1.00      1.00      1.00      3912
weighted avg       1.00      1.00      1.00      3912



In [119]:
xgb.feature_importances_

array([0.00431835, 0.00730576, 0.00500046, 0.01072232, 0.01530198,
       0.00732574, 0.00552161, 0.00500628, 0.00702649, 0.02064262,
       0.00568794, 0.00551499, 0.01485362, 0.00701196, 0.01073776,
       0.00775377, 0.00612147, 0.00753791, 0.01968845, 0.0191076 ,
       0.00774772, 0.01242974, 0.00760021, 0.00514153, 0.00610446,
       0.0068718 , 0.00779131, 0.00786371, 0.004228  , 0.0041478 ,
       0.01785598, 0.01523824, 0.05976624, 0.02028464, 0.00865596,
       0.0087466 , 0.01184711, 0.00688345, 0.00619421, 0.01579451,
       0.00640729, 0.01090147, 0.01094166, 0.01529321, 0.00681595,
       0.01324389, 0.01342751, 0.00459261, 0.03420368, 0.0077745 ,
       0.00499412, 0.00649472, 0.03045756, 0.00831944, 0.00413535,
       0.00578914, 0.00459282, 0.00601865, 0.01285164, 0.00703872,
       0.0083021 , 0.00584854, 0.00531411, 0.00470476, 0.01463732,
       0.00438537, 0.01574758, 0.00559496, 0.00728148, 0.00402621,
       0.03987878, 0.00459422, 0.00573036, 0.00494389, 0.01046

In [120]:
feats = pd.DataFrame(index=X.columns,data=xgb.feature_importances_,columns=['Importance'])
feats

Unnamed: 0,Importance
abdominalextensiondepthsitting,0.004318
acromialheight,0.007306
acromionradialelength,0.005000
anklecircumference,0.010722
axillaheight,0.015302
...,...
Age,0.005898
Heightin,0.006206
Weightlbs,0.008674
Gender_Female,0.034602


# Gridsearch¶

In [100]:
param_grid = {"n_estimators":[100, 300],'max_depth':[3,5,6], "learning_rate": [0.1, 0.3],
             "subsample":[0.5, 1], "colsample_bytree":[0.5, 1]}

In [101]:
xgb_model = XGBClassifier(random_state=42)

In [102]:
xgb_grid = GridSearchCV(xgb_model, param_grid, scoring = "f1_micro", verbose=2, n_jobs = -1).fit(X_train, y_train)

Fitting 5 folds for each of 48 candidates, totalling 240 fits


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 4 concurrent workers.
[Parallel(n_jobs=-1)]: Done  33 tasks      | elapsed:  2.5min
[Parallel(n_jobs=-1)]: Done 154 tasks      | elapsed: 13.7min
[Parallel(n_jobs=-1)]: Done 240 out of 240 | elapsed: 26.0min finished




In [126]:
xgb_grid.best_estimator_

XGBClassifier(base_score=0.5, booster='gbtree', colsample_bylevel=1,
              colsample_bynode=1, colsample_bytree=1, gamma=0, gpu_id=-1,
              importance_type='gain', interaction_constraints='',
              learning_rate=0.3, max_delta_step=0, max_depth=5,
              min_child_weight=1, missing=nan, monotone_constraints='()',
              n_estimators=300, n_jobs=4, num_parallel_tree=1,
              objective='multi:softprob', random_state=42, reg_alpha=0,
              reg_lambda=1, scale_pos_weight=None, subsample=0.5,
              tree_method='exact', validate_parameters=1, verbosity=None)

In [127]:
xgb_grid.best_params_

{'colsample_bytree': 1,
 'learning_rate': 0.3,
 'max_depth': 5,
 'n_estimators': 300,
 'subsample': 0.5}

In [128]:
y_pred = xgb_grid.predict(X_test)
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test,y_pred))

[[1060   18   29]
 [  43  339   13]
 [ 111   20   44]]
              precision    recall  f1-score   support

           1       0.87      0.96      0.91      1107
           2       0.90      0.86      0.88       395
           3       0.51      0.25      0.34       175

    accuracy                           0.86      1677
   macro avg       0.76      0.69      0.71      1677
weighted avg       0.84      0.86      0.84      1677



In [129]:
y_train_pred = xgb_grid.predict(X_train)
print(confusion_matrix(y_train, y_train_pred))
print(classification_report(y_train, y_train_pred))

[[2562    0    0]
 [   0  866    0]
 [   0    0  484]]
              precision    recall  f1-score   support

           1       1.00      1.00      1.00      2562
           2       1.00      1.00      1.00       866
           3       1.00      1.00      1.00       484

    accuracy                           1.00      3912
   macro avg       1.00      1.00      1.00      3912
weighted avg       1.00      1.00      1.00      3912



In [149]:
xgb_f1 = f1_score(y_test, y_pred, average='micro')
xgb_recall = recall_score(y_test, y_pred,average='micro')
#xgb_auc = roc_auc_score(y_test, y_pred,average='micro')

# Cross Validate

In [125]:
model = XGBClassifier(n_estimators=300, random_state=42, subsample=1, learning_rate= 0.1, colsample_bytree=0.5,
                                  max_dept=6)

scores = cross_validate(model, X_train, y_train, scoring = ['precision_micro','recall_micro',
                                                                   'f1_micro'], cv = 10)
df_scores = pd.DataFrame(scores, index = range(1, 11))
df_scores



Parameters: { "max_dept" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "max_dept" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "max_dept" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  passed down to XGBoost core.  Or some parameters are not used but slip through this
  verification. Please open an issue if you find above cases.


Parameters: { "max_dept" } might not be used.

  This may not be accurate due to some parameters are only used in language bindings but
  pass

Unnamed: 0,fit_time,score_time,test_precision_micro,test_recall_micro,test_f1_micro
1,12.780246,0.028964,0.836735,0.836735,0.836735
2,12.800675,0.038976,0.826531,0.826531,0.826531
3,13.728145,0.027981,0.84399,0.84399,0.84399
4,13.634104,0.028982,0.831202,0.831202,0.831202
5,13.289605,0.026964,0.85422,0.85422,0.85422
6,12.486924,0.048831,0.851662,0.851662,0.851662
7,12.246152,0.026984,0.869565,0.869565,0.869565
8,11.658325,0.028984,0.849105,0.849105,0.849105
9,12.088083,0.028982,0.85422,0.85422,0.85422
10,11.870784,0.027984,0.841432,0.841432,0.841432


In [130]:
df_scores.mean()[2:]

test_precision_micro    0.845866
test_recall_micro       0.845866
test_f1_micro           0.845866
dtype: float64

# Choose the best model based on the metric you choose and make a random prediction

In [172]:
log_model=LogisticRegression(solver='saga', max_iter=10000, multi_class='ovr')
log_model.fit(X_train_scaled,y_train)
y_pred=log_model.predict(X_test_scaled)
log_f1 = f1_score(y_test, y_pred,average='micro')
log_recall = recall_score(y_test, y_pred,average='micro')
#log_auc = roc_auc_score(y_test, y_pred,multi_class='ovr')
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[1075   11   21]
 [  29  358    8]
 [ 101   21   53]]
              precision    recall  f1-score   support

           1       0.89      0.97      0.93      1107
           2       0.92      0.91      0.91       395
           3       0.65      0.30      0.41       175

    accuracy                           0.89      1677
   macro avg       0.82      0.73      0.75      1677
weighted avg       0.87      0.89      0.87      1677



In [173]:
dt_feature_imp = pd.DataFrame(index = X.columns, data = dt_model.feature_importances_,
                              columns = ["Feature Importance"]).sort_values("Feature Importance", ascending = False)
dt_feature_imp

Unnamed: 0,Feature Importance
elbowrestheight,0.099939
sittingheight,0.077152
interpupillarybreadth,0.064618
heelbreadth,0.059533
bicristalbreadth,0.039951
...,...
shouldercircumference,0.000807
balloffootlength,0.000807
heelanklecircumference,0.000757
buttockdepth,0.000000


In [215]:
dt_feature_imp.loc[dt_feature_imp["Feature Importance"] > 0.001].index

Index(['elbowrestheight', 'sittingheight', 'interpupillarybreadth',
       'heelbreadth', 'bicristalbreadth', 'bitragionchinarc',
       'buttockpopliteallength', 'handlength', 'forearmhandlength',
       'earprotrusion', 'buttockkneelength', 'eyeheightsitting',
       'headcircumference', 'chestbreadth', 'shoulderlength', 'waistbreadth',
       'wristcircumference', 'waistbacklength', 'forearmcenterofgriplength',
       'anklecircumference', 'shoulderelbowlength', 'interscyeii', 'earlength',
       'thighclearance', 'wristheight', 'bicepscircumferenceflexed',
       'handcircumference', 'sleevelengthspinewrist', 'chestcircumference',
       'thighcircumference', 'tibialheight', 'trochanterionheight',
       'tragiontopofhead', 'neckcircumferencebase', 'waistfrontlengthsitting',
       'Age', 'overheadfingertipreachsitting', 'radialestylionlength',
       'buttockheight', 'bitragionsubmandibulararc', 'thumbtipreach',
       'axillaheight', 'poplitealheight', 'forearmcircumferenceflexed

In [223]:
final=df2[dt_feature_imp.loc[dt_feature_imp["Feature Importance"] > 0.001].index]

In [224]:
final['DODRace']=df2['DODRace']

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: https://pandas.pydata.org/pandas-docs/stable/user_guide/indexing.html#returning-a-view-versus-a-copy
  final['DODRace']=df2['DODRace']


In [225]:
final

Unnamed: 0,elbowrestheight,sittingheight,interpupillarybreadth,heelbreadth,bicristalbreadth,bitragionchinarc,buttockpopliteallength,handlength,forearmhandlength,earprotrusion,...,headbreadth,sleeveoutseam,acromialheight,neckcircumference,calfcircumference,forearmforearmbreadth,acromionradialelength,stature,lateralfemoralepicondyleheight,DODRace
0,247,928,685,70,274,319,509,193,477,19,...,150,600,1467,400,373,575,337,1776,500,1
1,232,884,620,72,257,344,468,195,476,18,...,146,564,1395,380,357,523,326,1702,488,1
2,237,917,665,70,261,345,506,203,491,23,...,148,604,1430,403,412,575,341,1735,482,2
3,272,903,640,68,262,328,437,194,467,25,...,158,550,1347,407,395,593,310,1655,452,1
4,188,919,675,69,263,340,567,218,550,19,...,153,641,1585,398,425,605,372,1914,585,2
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
1981,210,865,615,75,312,306,541,193,456,19,...,151,555,1392,369,373,514,335,1687,486,3
1982,274,881,650,61,287,309,464,183,440,23,...,160,525,1324,359,375,514,301,1613,457,3
1983,204,841,640,72,299,317,525,191,459,18,...,144,566,1334,352,370,530,318,1644,476,2
1984,222,854,640,59,280,282,480,176,425,21,...,149,530,1293,326,370,430,302,1616,445,3


In [226]:
X=final.drop('DODRace', axis=1)
y=final['DODRace']

In [227]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=101)

In [228]:
X_train_scaled = scaler.fit_transform(X_train)
X_test_scaled = scaler.transform(X_test)

In [229]:
log_model=LogisticRegression(solver='saga', max_iter=10000, multi_class='ovr')
log_model.fit(X_train_scaled,y_train)
y_pred=log_model.predict(X_test_scaled)
log_f1 = f1_score(y_test, y_pred,average='micro')
log_recall = recall_score(y_test, y_pred,average='micro')
#log_auc = roc_auc_score(y_test, y_pred,multi_class='ovr')
print(confusion_matrix(y_test, y_pred))
print(classification_report(y_test, y_pred))

[[1071   12   24]
 [  27  359    9]
 [ 101   20   54]]
              precision    recall  f1-score   support

           1       0.89      0.97      0.93      1107
           2       0.92      0.91      0.91       395
           3       0.62      0.31      0.41       175

    accuracy                           0.88      1677
   macro avg       0.81      0.73      0.75      1677
weighted avg       0.87      0.88      0.87      1677



---
---