In [30]:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from collections import Counter
import re

%matplotlib inline

In [101]:
df = pd.read_csv("../Data/donut-falls-trail.csv")

In [102]:
print(df.shape)
df

(2030, 4)


Unnamed: 0.1,Unnamed: 0,Date,Tags,Comments
0,0,2019-09-07,['hiking'],Loved this trail! It's short but I went early...
1,1,2019-09-07,['hiking'],Popular trail with lots of people. Great short...
2,2,2019-09-05,['hiking'],"Very Easy hike, even if you have to park on th..."
3,3,2019-09-05,['hiking'],You can either park by the highway or take the...
4,4,2019-09-03,['hiking'],The trail itself was very easy and well shaded...
5,5,2019-09-02,['hiking'],Too many people and the waterfall itself wasn’...
6,6,2019-09-02,['hiking'],We started a little after 7am and it was not t...
7,7,2019-09-02,['hiking'],It was okay! There was a lot of people though!...
8,8,2019-09-02,['hiking'],Easy hike that is perfect for families or for ...
9,9,2019-09-01,['hiking'],Super fun trail. But way way crowded on a Sund...


In [103]:
# Lets remove rows with no comments
df = df[pd.notnull(df['Comments'])]

In [68]:
df.shape

(845, 4)

#### Here is another way to drop rows with NAN, the only difference is that this one removes everything
df = df.dropna()

In [62]:
tags = Counter(df['Tags'])
tags

Counter({"['']": 324,
         "['backpacking']": 3,
         "['bird watching']": 4,
         "['cross country skiing']": 1,
         "['hiking', 'bugs', 'muddy', 'off trail']": 1,
         "['hiking', 'muddy', 'rocky', 'snow']": 5,
         "['hiking', 'muddy', 'snow']": 6,
         "['hiking', 'muddy']": 1,
         "['hiking', 'off trail', 'rocky']": 1,
         "['hiking', 'private property', 'rocky']": 1,
         "['hiking', 'rocky', 'scramble']": 1,
         "['hiking', 'rocky']": 4,
         "['hiking', 'scramble', 'snow']": 3,
         "['hiking', 'scramble']": 2,
         "['hiking', 'snow']": 6,
         "['hiking']": 460,
         "['muddy', 'snow']": 3,
         "['nature trips']": 3,
         "['scenic driving']": 2,
         "['snowshoeing', 'scramble', 'snow']": 1,
         "['snowshoeing']": 7,
         "['walking']": 6})

In [104]:
cf = df
cf

Unnamed: 0.1,Unnamed: 0,Date,Tags,Comments
0,0,2019-09-07,['hiking'],Loved this trail! It's short but I went early...
1,1,2019-09-07,['hiking'],Popular trail with lots of people. Great short...
2,2,2019-09-05,['hiking'],"Very Easy hike, even if you have to park on th..."
3,3,2019-09-05,['hiking'],You can either park by the highway or take the...
4,4,2019-09-03,['hiking'],The trail itself was very easy and well shaded...
5,5,2019-09-02,['hiking'],Too many people and the waterfall itself wasn’...
6,6,2019-09-02,['hiking'],We started a little after 7am and it was not t...
7,7,2019-09-02,['hiking'],It was okay! There was a lot of people though!...
8,8,2019-09-02,['hiking'],Easy hike that is perfect for families or for ...
9,9,2019-09-01,['hiking'],Super fun trail. But way way crowded on a Sund...


In [105]:
# remove the brackets and quatations
def cleanit(df, text_field):
    df[text_field] = df[text_field].str.strip("'[]")
    df[text_field] = df[text_field].str.replace("'","")
    return df

cleanit(cf, 'Tags')

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  This is separate from the ipykernel package so we can avoid doing imports until
A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  after removing the cwd from sys.path.


Unnamed: 0.1,Unnamed: 0,Date,Tags,Comments
0,0,2019-09-07,hiking,Loved this trail! It's short but I went early...
1,1,2019-09-07,hiking,Popular trail with lots of people. Great short...
2,2,2019-09-05,hiking,"Very Easy hike, even if you have to park on th..."
3,3,2019-09-05,hiking,You can either park by the highway or take the...
4,4,2019-09-03,hiking,The trail itself was very easy and well shaded...
5,5,2019-09-02,hiking,Too many people and the waterfall itself wasn’...
6,6,2019-09-02,hiking,We started a little after 7am and it was not t...
7,7,2019-09-02,hiking,It was okay! There was a lot of people though!...
8,8,2019-09-02,hiking,Easy hike that is perfect for families or for ...
9,9,2019-09-01,hiking,Super fun trail. But way way crowded on a Sund...


Now let's expand out classification column into a "multi labeled" series of classes.

In [107]:
df[['class1','class2','class3','class4']] = cf['Tags'].str.split(",", expand=True)

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  self[k1] = value[k2]


So now we have a dataset with multi-label for classification and the comment column. We will be cleaning up the label column in order to prepare it for classification. 

In [112]:
df.head(2)

Unnamed: 0.1,Unnamed: 0,Date,Tags,Comments,class1,class2,class3,class4
0,0,2019-09-07,hiking,Loved this trail! It's short but I went early...,hiking,,,
1,1,2019-09-07,hiking,Popular trail with lots of people. Great short...,hiking,,,


Let's remove the columns that are not going to be use. 

In [113]:
df = df[['Date','Comments','class1','class2', 'class3','class4']]

In [114]:
df.head(2)

Unnamed: 0,Date,Comments,class1,class2,class3,class4
0,2019-09-07,Loved this trail! It's short but I went early...,hiking,,,
1,2019-09-07,Popular trail with lots of people. Great short...,hiking,,,


Now let's add some the Jday column as a predictor. 