In [1]:
%%capture
!pip install pymongo pprint dateparser matplotlib pandas sklearn numpy seaborn

In [4]:
import pymongo
import pprint
import dateparser
import pandas as pd
import numpy as np
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report, confusion_matrix
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split

%matplotlib inline

In [5]:
course_cluster_uri = "mongodb://agg-student:agg-password@cluster0-shard-00-00-jxeqq.mongodb.net:27017,cluster0-shard-00-01-jxeqq.mongodb.net:27017,cluster0-shard-00-02-jxeqq.mongodb.net:27017/test?ssl=true&replicaSet=Cluster0-shard-0&authSource=admin"
course_client = pymongo.MongoClient(course_cluster_uri)
titanic = course_client['coursera-agg']['titanic']

In [128]:
titanic.find_one()

{'_id': ObjectId('59f3769387ef3716f7157ac5'),
 'age': 35,
 'cabin': '',
 'class': 3,
 'fare_paid': 8.05,
 'gender': 'male',
 'name': 'Allen, Mr. William Henry',
 'parents_children': 0,
 'passenger_id': 5,
 'point_of_embarkation': 'S',
 'siblings_spouse': 0,
 'survived': 0,
 'ticket_number': 373450}

In [129]:
initial_project = {
    "$project": {
        "_id": 0,
        "name": 0,
        "point_of_embarkation": 0,
        "ticket_number": 0,
        "passenger_id": 0,
        "cabin": 0,
    }
}

In [130]:
# todo - correct the age.
# *HINT* -- If the $type of "$age" is a string, set it to 0
age_correction = {
    "$switch" : {
        "branches" : [
            {
                "case" : { "$eq" : [ {"$type" : "$age"}, "string"]},
                "then" : 0
            }
        ],
        "default" : "$age"
    }
}

In [131]:
# todo - one hot encode gender_female. 1 if female, 0 if male
one_hot_female = {
    "$switch" : {
        "branches" : [
            {
                "case" : { "$eq" : [ "$gender", "female"]},
                "then" : 1
            },
            {
                "case" : { "$eq" : [ "$gender", "male"]},
                "then" : 0
            }
        ]
    }
}

In [132]:
# todo - the inverse of above. 1 if male, 0 if female
one_hot_male = {
    "$switch" : {
        "branches" : [
            {
                "case" : { "$eq" : [ "$gender", "female"]},
                "then" : 0
            },
            {
                "case" : { "$eq" : [ "$gender", "male"]},
                "then" : 1
            }
        ]
    }
}

In [133]:
encoding_stage = {
    "$addFields": {
        "gender_female": one_hot_female,
        "gender_male": one_hot_male,
        "age": age_correction
    }
}

In [134]:
final_project = {
    "$project": {
        "gender": 0
    }
}

In [135]:
pipeline = [initial_project, encoding_stage, final_project]

In [136]:
df = pd.DataFrame.from_dict(list(titanic.aggregate(pipeline)))
df.head(50)

Unnamed: 0,age,class,fare_paid,gender_female,gender_male,parents_children,siblings_spouse,survived
0,35.0,3,8.05,0,1,0,0,0
1,22.0,3,7.25,0,1,0,1,0
2,4.0,3,16.7,1,0,1,1,1
3,27.0,3,11.1333,1,0,2,0,1
4,35.0,1,53.1,1,0,0,1,1
5,20.0,3,8.05,0,1,0,0,0
6,58.0,1,26.55,1,0,0,0,1
7,39.0,3,31.275,0,1,5,1,0
8,55.0,2,16.0,1,0,0,0,1
9,0.0,2,13.0,0,1,0,0,1


In [137]:
X = df.drop('survived', axis=1)

In [138]:
y = df['survived']

In [139]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.15)

In [140]:
dtree = DecisionTreeClassifier()

In [141]:
%%capture
dtree.fit(X_train, y_train)

In [142]:
predictions = dtree.predict(X_test)

In [143]:
print(confusion_matrix(y_test, predictions))
print("\n")
print(classification_report(y_test, predictions))

[[67 16]
 [16 35]]


             precision    recall  f1-score   support

          0       0.81      0.81      0.81        83
          1       0.69      0.69      0.69        51

avg / total       0.76      0.76      0.76       134



In [144]:
rfc = RandomForestClassifier(n_estimators=20)

In [145]:
%%capture
rfc.fit(X_train, y_train)

In [146]:
rfc_pred = rfc.predict(X_test)

In [147]:
print(confusion_matrix(y_test, rfc_pred))
print("\n")
print(classification_report(y_test, rfc_pred, target_names=['test', 'predictions']))

[[74  9]
 [15 36]]


             precision    recall  f1-score   support

       test       0.83      0.89      0.86        83
predictions       0.80      0.71      0.75        51

avg / total       0.82      0.82      0.82       134



In [148]:
iterations = 1000
dtree_avg_accuracy = 0
rfc_avg_accuracy = 0
for _ in range(iterations):
    dtree.fit(X_train, y_train)
    dtree_avg_accuracy += dtree.score(X_test, y_test)
    rfc.fit(X_train, y_train)
    rfc_avg_accuracy += rfc.score(X_test, y_test)
    
print(f"""
After {iterations} iterations:
  Single Decision Tree accuracy: {dtree_avg_accuracy / iterations}
  Random Forest accuracy:        {rfc_avg_accuracy / iterations}
  
  Lab Answer:  dtree={round(dtree_avg_accuracy / iterations, 2)}, rfc={round(rfc_avg_accuracy / iterations, 2)}
""")


After 1000 iterations:
  Single Decision Tree accuracy: 0.7650000000000047
  Random Forest accuracy:        0.821738805970147
  
  Lab Answer:  dtree=0.77, rfc=0.82

