In [1]:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import accuracy_score
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import LabelEncoder
from sklearn.tree import DecisionTreeClassifier

In [2]:
# Load the dataset from the CSV file in the working directory.
penguins = pd.read_csv("penguins.csv")
# Left disabled: rows with missing values are kept in the raw frame and are
# only dropped later, after feature selection (see `df2.dropna()`).
#penguins.dropna(inplace=True)
# Display the raw 'Sex' column to inspect its values.
penguins['Sex']

0        MALE
1      FEMALE
2      FEMALE
3         NaN
4      FEMALE
        ...  
339       NaN
340    FEMALE
341      MALE
342    FEMALE
343      MALE
Name: Sex, Length: 344, dtype: object

In [3]:
# List the distinct values stored in 'Sex' (including any missing/placeholder entries).
penguins['Sex'].unique()


array(['MALE', 'FEMALE', nan, '.'], dtype=object)

In [4]:
# Convert the 'Sex' column to a binary column (1 = MALE, 0 = FEMALE).
# Rows whose 'Sex' is "." are removed because the true sex is unknown.
df = penguins[penguins['Sex'] != '.']

# Feature selection: keep the predictors together with the target column.
df2 = df[['Island', 'CulmenLength_mm', 'CulmenDepth_mm', 'FlipperLength_mm',
          'BodyMass_g', 'Sex', 'Delta15N', 'Delta13C']]

# Drop rows with any missing value, then take an explicit copy so the column
# assignment below works on an independent frame rather than on a selection
# of `df`.
df2 = df2.dropna().copy()

# Encode with an explicit comparison instead of
# pd.get_dummies(..., drop_first=True): that call returns a DataFrame (not a
# Series), makes the 0/1 meaning depend on the alphabetical order of the
# categories, and yields booleans on recent pandas versions.
df2['Sex'] = (df2['Sex'] == 'MALE').astype(int)
# Print the updated dataframe
df2



Unnamed: 0,Island,CulmenLength_mm,CulmenDepth_mm,FlipperLength_mm,BodyMass_g,Sex,Delta15N,Delta13C
1,Torgersen,39.5,17.4,186.0,3800.0,0,8.94956,-24.69454
2,Torgersen,40.3,18.0,195.0,3250.0,0,8.36821,-25.33302
4,Torgersen,36.7,19.3,193.0,3450.0,0,8.76651,-25.32426
5,Torgersen,39.3,20.6,190.0,3650.0,1,8.66496,-25.29805
6,Torgersen,38.9,17.8,181.0,3625.0,0,9.18718,-25.21799
...,...,...,...,...,...,...,...,...
338,Biscoe,47.2,13.7,214.0,4925.0,0,7.99184,-26.20538
340,Biscoe,46.8,14.3,215.0,4850.0,0,8.41151,-26.13832
341,Biscoe,50.4,15.7,222.0,5750.0,1,8.30166,-26.04117
342,Biscoe,45.2,14.8,212.0,5200.0,0,8.24246,-26.11969


In [5]:
# Map each island name to an integer code so the column can be used by the
# decision tree.
encoder = LabelEncoder()
encoder.fit(df2['Island'])
df2['Island'] = encoder.transform(df2['Island'])
df2


Unnamed: 0,Island,CulmenLength_mm,CulmenDepth_mm,FlipperLength_mm,BodyMass_g,Sex,Delta15N,Delta13C
1,2,39.5,17.4,186.0,3800.0,0,8.94956,-24.69454
2,2,40.3,18.0,195.0,3250.0,0,8.36821,-25.33302
4,2,36.7,19.3,193.0,3450.0,0,8.76651,-25.32426
5,2,39.3,20.6,190.0,3650.0,1,8.66496,-25.29805
6,2,38.9,17.8,181.0,3625.0,0,9.18718,-25.21799
...,...,...,...,...,...,...,...,...
338,0,47.2,13.7,214.0,4925.0,0,7.99184,-26.20538
340,0,46.8,14.3,215.0,4850.0,0,8.41151,-26.13832
341,0,50.4,15.7,222.0,5750.0,1,8.30166,-26.04117
342,0,45.2,14.8,212.0,5200.0,0,8.24246,-26.11969


In [6]:
# Split the data into training and testing sets
X = df2.drop(columns=['Sex'])
y = df2.loc[:, 'Sex']

# Features and target are re-joined (target last); a reproducible 80% sample
# becomes the training set and every remaining row becomes the test set.
train_data = pd.concat([X, y], axis=1).sample(frac=0.8, random_state=42)
test_data = pd.concat([X, y], axis=1).drop(index=train_data.index)

# Separate the predictor variables and the variable to predict
predictors_train, target_train = train_data.drop(columns='Sex'), train_data['Sex']
predictors_test, target_test = test_data.drop(columns='Sex'), test_data['Sex']

In [7]:
def gini_impurity(df2, target='Sex'):
    """Return the Gini impurity of the class labels in ``df2[target]``.

    The impurity is ``1 - sum(p_k ** 2)`` where ``p_k`` is the fraction of
    rows belonging to class ``k``. It is 0 for a pure node and grows as the
    classes become more evenly mixed.

    Parameters
    ----------
    df2 : pandas.DataFrame
        Rows of the node being evaluated; must contain the ``target`` column.
    target : str, default 'Sex'
        Name of the column holding the class labels.

    Returns
    -------
    float
        The Gini impurity; 0.0 when the node contains no labels.
    """
    proportions = df2[target].value_counts(normalize=True)
    # A node without any labels has nothing to misclassify; without this guard
    # the formula below would report 1, which is not a valid impurity.
    if proportions.empty:
        return 0.0
    return float(1 - (proportions ** 2).sum())


In [8]:
def decision_tree(df2):
    """Recursively build a binary decision tree that predicts the 'Sex' column.

    Every column of ``df2`` other than 'Sex' is treated as a feature. Each
    candidate split has the form ``feature <= value`` for a value observed in
    the data; the split with the lowest weighted Gini impurity (computed with
    ``gini_impurity``) is chosen, and the first such split wins on ties.

    Parameters
    ----------
    df2 : pandas.DataFrame
        Training rows; must contain a 'Sex' column with the class labels.

    Returns
    -------
    dict or label
        A leaf is returned as the bare class label. An internal node is a
        dict with the keys 'feature', 'value', 'left' (rows with
        ``feature <= value``) and 'right' (rows with ``feature > value``).

    Raises
    ------
    ValueError
        If ``df2`` has no rows.
    """
    target = 'Sex'
    n_rows = len(df2)
    if n_rows == 0:
        raise ValueError("cannot build a decision tree from an empty DataFrame")

    labels = df2[target]

    # If there is only one class, return that class
    classes = labels.unique()
    if len(classes) == 1:
        return classes[0]

    # Select the features by name rather than by position so the target is
    # never used as a predictor, wherever it sits among the columns.
    features = [column for column in df2.columns if column != target]

    # Choose the best feature and value combination to split the data
    best_feature, best_value, best_gini = None, None, float('inf')
    for feature in features:
        column = df2[feature]
        for value in column.unique():
            left_data = df2[column <= value]
            right_data = df2[column > value]
            # A split that leaves one side empty separates nothing; choosing
            # it would recurse on the same rows forever.
            if len(left_data) == 0 or len(right_data) == 0:
                continue
            gini = len(left_data) / n_rows * gini_impurity(left_data) + len(right_data) / n_rows * gini_impurity(right_data)
            if gini < best_gini:
                best_feature, best_value, best_gini = feature, value, gini

    # No features left, or no split separates the rows (e.g. identical
    # feature values with different labels): return the most common class.
    if best_feature is None:
        return labels.value_counts().index[0]

    # Split the data based on the best feature and value combination
    left_data = df2[df2[best_feature] <= best_value]
    right_data = df2[df2[best_feature] > best_value]

    # Recursively build the subtrees and return the node describing the split
    return {
        'feature': best_feature,
        'value': best_value,
        'left': decision_tree(left_data),
        'right': decision_tree(right_data),
    }



In [9]:
# Build the decision tree
# `train_data` holds the predictor columns plus 'Sex'; the result is either a
# nested dict of split nodes or a bare class label.
tree = decision_tree(train_data)
tree


{'feature': 'BodyMass_g',
 'value': 3700.0,
 'left': {'feature': 'CulmenDepth_mm',
  'value': 18.5,
  'left': {'feature': 'BodyMass_g',
   'value': 3450.0,
   'left': 0,
   'right': {'feature': 'BodyMass_g', 'value': 3475.0, 'left': 1, 'right': 0}},
  'right': {'feature': 'CulmenLength_mm',
   'value': 38.9,
   'left': {'feature': 'CulmenDepth_mm',
    'value': 19.3,
    'left': {'feature': 'Island',
     'value': 0,
     'left': {'feature': 'CulmenLength_mm',
      'value': 37.7,
      'left': 1,
      'right': 0},
     'right': 0},
    'right': 1},
   'right': {'feature': 'BodyMass_g',
    'value': 3525.0,
    'left': {'feature': 'FlipperLength_mm',
     'value': 194.0,
     'left': 0,
     'right': 1},
    'right': 1}}},
 'right': {'feature': 'CulmenDepth_mm',
  'value': 14.8,
  'left': {'feature': 'BodyMass_g', 'value': 5150.0, 'left': 0, 'right': 1},
  'right': {'feature': 'BodyMass_g',
   'value': 3850.0,
   'left': {'feature': 'CulmenLength_mm',
    'value': 47.6,
    'left': {'

In [10]:
# Create scikit-learn's decision tree model and train it on the training
# data; `fit` returns the fitted estimator, so construction and training can
# be chained.
clf = DecisionTreeClassifier(random_state=42).fit(predictors_train, target_train)

# Make predictions on the test data
preds = clf.predict(predictors_test)

# Calculate the accuracy of the model
accuracy = accuracy_score(y_true=target_test, y_pred=preds)
print('Accuracy:', accuracy)

Accuracy: 0.8769230769230769


In [11]:
# RandomForestClassifier is imported in the notebook's import cell at the top.
# Create a random forest of 1000 trees and train it on the same training split
# used for the single decision tree.
clf_forrest = RandomForestClassifier(n_estimators=1000, random_state=30)
clf_forrest.fit(predictors_train, target_train)

# Make predictions on the test data
preds_forrest = clf_forrest.predict(predictors_test)

# Calculate the accuracy of the model
accuracy_forrest = accuracy_score(target_test, preds_forrest)
print('Accuracy:', accuracy_forrest)

Accuracy: 0.8769230769230769


In [12]:
def predict(row, decision_tree):
    """Classify one sample by walking a tree in the format built by ``decision_tree``.

    Parameters
    ----------
    row : mapping or pandas.Series
        Feature values of the sample, indexable by feature name.
    decision_tree : dict or label
        A tree node: either a dict with the keys 'feature', 'value', 'left'
        and 'right', or a bare class label representing a leaf.

    Returns
    -------
    label
        The class label stored in the leaf reached by the sample.
    """
    node = decision_tree
    # Internal nodes are dicts; anything else is a leaf holding the class label.
    while isinstance(node, dict):
        # Same convention as the tree builder: `feature <= value` goes left,
        # everything else goes right.
        if row[node['feature']] <= node['value']:
            node = node['left']
        else:
            node = node['right']
    return node

# NOTE(review): evaluating the bare name only displays the function object; it
# neither builds a tree nor calls `predict` — confirm whether a call was intended.
decision_tree

<function __main__.decision_tree(df2)>