In [54]:
import pandas as pd
import numpy as np
import re

from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer

# Load the raw datasets; all paths are relative to the notebook's working directory.
volunteer = pd.read_csv("./data/volunteer_opportunities.csv")
wine = pd.read_csv("./data/wine_types.csv")
hiking = pd.read_json("./data/hiking.json")

# Feature matrix and target taken from the wine data.
wine_feature_columns = ["Proline", "Total phenols", "Hue", "Nonflavanoid phenols"]
X = wine[wine_feature_columns]
y = wine["Type"]

# Five recorded 5k times per runner; position i in every list belongs to
# the runner at position i in runner_names.
runner_names = ["Sue", "Mark", "Sean", "Erin", "Jenny", "Russell"]
run_times = {
    "run1": [20.1, 16.5, 23.5, 21.7, 25.8, 30.9],
    "run2": [18.5, 17.1, 25.1, 21.1, 27.1, 29.6],
    "run3": [19.6, 16.9, 25.2, 20.9, 26.1, 31.4],
    "run4": [20.3, 17.6, 24.6, 22.1, 26.7, 30.4],
    "run5": [18.3, 17.3, 23.9, 22.2, 26.9, 29.9],
}
running_times_5k = pd.DataFrame({"name": runner_names, **run_times})

In [55]:
# Encode the "Accessible" flag as integer labels and store the result in a
# new column next to the original.
enc = LabelEncoder()
accessible_codes = enc.fit_transform(hiking["Accessible"])
hiking["Accessible_enc"] = accessible_codes

# Show the raw and encoded values side by side for a quick sanity check
accessible_preview = hiking[["Accessible", "Accessible_enc"]]
print(accessible_preview.head())

  Accessible  Accessible_enc
0          Y               1
1          N               0
2          N               0
3          N               0
4          N               0


In [56]:
# One-hot encode category_desc: one indicator column per distinct category.
# dtype=int pins the indicators to 0/1 integers. Without it, pandas >= 2.0
# returns booleans (True/False), so the preview would no longer match the
# 0/1 table this cell is meant to produce.
category_enc = pd.get_dummies(volunteer["category_desc"], dtype=int)

# Take a look at the encoded columns
print(category_enc.head())

   Education  Emergency Preparedness  Environment  Health  \
0          0                       0            0       0   
1          0                       0            0       0   
2          0                       0            0       0   
3          0                       0            0       0   
4          0                       0            1       0   

   Helping Neighbors in Need  Strengthening Communities  
0                          0                          0  
1                          0                          1  
2                          0                          1  
3                          0                          1  
4                          0                          0  


In [57]:
# Average each runner's times row-wise. The label slice "run1":"run5"
# selects only the run columns, so "name" is excluded and re-running the
# cell does not fold an existing "mean" column into the average.
run_columns = running_times_5k.loc[:, "run1":"run5"]
running_times_5k["mean"] = run_columns.mean(axis=1)

# Preview the frame with the new column
print(running_times_5k.head())

    name  run1  run2  run3  run4  run5   mean
0    Sue  20.1  18.5  19.6  20.3  18.3  19.36
1   Mark  16.5  17.1  16.9  17.6  17.3  17.08
2   Sean  23.5  25.1  25.2  24.6  23.9  24.46
3   Erin  21.7  21.1  20.9  22.1  22.2  21.60
4  Jenny  25.8  27.1  26.1  26.7  26.9  26.52


In [58]:
# Parse the start_date_date column into datetimes once, then reuse the
# parsed series for both new columns.
start_dates = pd.to_datetime(volunteer["start_date_date"])
volunteer["start_date_converted"] = start_dates

# The .dt accessor exposes datetime components; keep only the month number
volunteer["start_date_month"] = start_dates.dt.month

# Preview the parsed dates next to the extracted month
date_preview = volunteer[["start_date_converted", "start_date_month"]]
print(date_preview.head())

  start_date_converted  start_date_month
0           2011-07-30                 7
1           2011-02-01                 2
2           2011-01-29                 1
3           2011-02-14                 2
4           2011-02-05                 2


In [59]:
# Restrict hiking to the columns listed below (this drops the helper column
# "Accessible_enc" added earlier), then discard every row that has a
# missing value in any of the remaining columns.
hiking_columns = [
    "Prop_ID",
    "Name",
    "Location",
    "Park_Name",
    "Length",
    "Difficulty",
    "Other_Details",
    "Accessible",
    "Limited_Access",
]
hiking = hiking[hiking_columns].dropna()

In [60]:
# Pattern for a decimal number (digits, a dot, more digits). Compiled once
# here instead of on every call to return_mileage.
_MILEAGE_PATTERN = re.compile(r"\d+\.\d+")


def return_mileage(length):
    """Return the decimal number at the start of ``length`` as a float.

    Parameters
    ----------
    length : object
        Free-text trail length such as ``"0.75 miles"``. Non-string values
        (``None``, ``NaN``, numbers) are accepted and treated as "no match".

    Returns
    -------
    float or None
        The parsed number, or ``None`` when ``length`` is not a string or
        does not begin with a decimal number (e.g. ``"Various"``).

    Notes
    -----
    The match is anchored at the beginning of the text and requires a
    decimal point, so ``"about 1.5 miles"`` and ``"2 miles"`` both yield
    ``None``.
    """
    # Missing values arrive as None/NaN rather than str; the regex engine
    # would raise TypeError on them, so report "no mileage" instead.
    if not isinstance(length, str):
        return None

    # .match() only matches at the start of the string (unlike .search()).
    mile = _MILEAGE_PATTERN.match(length)
    if mile is None:
        return None

    # group(0) is the full matched text, e.g. "0.75"
    return float(mile.group(0))
        
# Parse every Length entry into a numeric mileage; entries without a
# leading decimal number come back as missing values.
hiking["Length_num"] = hiking["Length"].apply(return_mileage)

# Compare the raw text with the parsed number
length_preview = hiking[["Length", "Length_num"]]
print(length_preview.head())

       Length  Length_num
1    1.0 mile        1.00
2  0.75 miles        0.75
3   0.5 miles        0.50
4   0.5 miles        0.50
5     Various         NaN


In [61]:
# Build the tf-idf vectorizer with its default settings
tfidf_vec = TfidfVectorizer()

# Text to vectorize: the opportunity titles
title_text = volunteer["title"]

# Learn the vocabulary and idf weights from the titles and return their
# tf-idf representation in one step
text_tfidf = tfidf_vec.fit_transform(title_text)

In [65]:
# Keep only the text and label columns and drop rows missing either one.
# Note: this rebinds `volunteer`, so the other columns are no longer
# available to cells run after this point.
labelled_columns = ["title", "category_desc"]
volunteer = volunteer[labelled_columns].dropna()

# Vectorize again on the filtered titles so that the rows of the tf-idf
# matrix line up with the remaining category labels.
title_text = volunteer["title"]
tfidf_vec = TfidfVectorizer()
text_tfidf = tfidf_vec.fit_transform(title_text)

# Classifier that is fitted in the following cell
nb = GaussianNB()

In [66]:
# Target labels for the classifier. This rebinds `y`, which earlier held
# the wine "Type" column.
y = volunteer["category_desc"]

# GaussianNB is given a dense array here, so materialize the tf-idf matrix
# with .toarray() before splitting.
# NOTE(review): the vectorizer was fitted on all titles before this split,
# so the idf weights also reflect the test rows - consider fitting it on
# the training titles only.
features = text_tfidf.toarray()

# Stratify on y so both partitions keep the class distribution of
# category_desc; random_state fixes the shuffle for reproducibility.
X_train, X_test, y_train, y_test = train_test_split(
    features, y, random_state=42, stratify=y
)

# Fit on the training partition, then report accuracy on the held-out one
nb.fit(X_train, y_train)
accuracy = nb.score(X_test, y_test)
print(accuracy)

0.5161290322580645
