# TO DO:
- How to handle NaN in encoder? (drop before classification)
- Mess with classifier parameters
- test classifier
- how to handle repeats?
- change order of operations so that slave_indexes match unclassified_data...how to do this?


In [1]:
import collections

import numpy as np
import pandas as pd
import exploringShipLogbooks.wordCount as wc

from fuzzywuzzy import fuzz
from sklearn import preprocessing
from sklearn.naive_bayes import MultinomialNB

from exploringShipLogbooks.basic_utils import isolate_training_data
from exploringShipLogbooks.basic_utils import extract_logbook_data
from exploringShipLogbooks.basic_utils import isolate_columns
from exploringShipLogbooks.basic_utils import encode_data_df
from exploringShipLogbooks.basic_utils import clean_data

from exploringShipLogbooks.config import *

# Load and clean data
### Load CLIWOC ship logs

In [2]:
# Load the CLIWOC logbook table through the project helper
# (original note: the data is extracted from a zip file).
cliwoc_data = extract_logbook_data('CLIWOC15.csv')
# Optional, currently disabled: work on a slice of the rows only.
#cliwoc_data = cliwoc_data.loc[50000:100000]

  if self.run_code(code, result):


### Find definite slave data in CLIWOC data set
- These logs will be used to test the classifier

In [3]:
# Count slave-related key words in the text columns of every CLIWOC log.
keyword_hits = wc.count_key_words(cliwoc_data, text_columns, slave_words)

# Boolean mask of logs with a non-zero 'ContainsKeyWord' value, plus the
# matching row labels.
slave_mask = keyword_hits['ContainsKeyWord'] != 0
slave_index = keyword_hits[slave_mask].index

print('Found ', len(slave_index), ' logs that mention slaves')

# The key-word table is only needed to build the mask; release it.
del keyword_hits

Found  464  logs that mention slaves


### Clean CLIWOC data

In [4]:
# Keep only the columns listed in desired_columns (presumably provided by the
# wildcard config import -- TODO confirm).
# NOTE(review): this overwrites cliwoc_data, so re-running the cell passes the
# already reduced frame to isolate_columns again.
cliwoc_data = isolate_columns(cliwoc_data, desired_columns)

# clean data (make all same case) -- disabled here; clean_data is applied to
# the joined data set further down instead.
#cliwoc_data = clean_data(cliwoc_data)

### Load Slave Voyages data

In [5]:
# Load the pickled slave voyages table.
# NOTE(review): unpickling can execute arbitrary code -- only point this at a
# trusted file.
file_name = './exploringShipLogbooks/data/tastdb-exp-2010'
# Bind the frame to a single name. The former chained assignment also created
# a `df` alias, which would keep the full raw table referenced after
# slave_voyage_logs is reduced to a few columns in the next cell.
slave_voyage_logs = pd.read_pickle(file_name)

### Clean Slave voyages data

In [6]:
# Source columns to keep from the slave voyages table.
slave_voyage_desired_cols = ['portdep', 'portret', 'rig', 'national', 'yeardep']
slave_voyage_logs = isolate_columns(slave_voyage_logs, slave_voyage_desired_cols)

# Rename by name instead of by position, so the result does not depend on the
# column order returned by isolate_columns. With a positional assignment in the
# order listed above, 'portdep' would end up labelled 'Nationality'.
# NOTE(review): mapping inferred from the column names -- confirm against the
# slave voyages codebook.
slave_voyage_column_map = {
    'national': 'Nationality',
    'rig': 'ShipType',
    'portdep': 'VoyageFrom',
    'portret': 'VoyageTo',
    'yeardep': 'Year',
}
slave_voyage_logs = slave_voyage_logs.rename(columns=slave_voyage_column_map)

# Fix the column order (and fail loudly if an expected column is missing).
slave_voyage_logs = slave_voyage_logs[['Nationality', 'ShipType',
                                       'VoyageFrom', 'VoyageTo', 'Year']]

#slave_voyage_logs = clean_data(slave_voyage_logs)

# Join data sets

- Add a label (`data_indices`) to every row recording which of the three data subsets it came from. These labels are used later to select rows for the classifier.

In [7]:
# Label CLIWOC rows by origin: 0 = no slave key words, 1 = mentions slaves.
cliwoc_data_indices_no_slaves = pd.DataFrame(
    0,
    index=cliwoc_data.loc[~slave_mask].index,
    columns=['data_indices'],
)
cliwoc_data_indices_slaves = pd.DataFrame(
    1,
    index=cliwoc_data.loc[slave_mask].index,
    columns=['data_indices'],
)

# Stack both label frames and restore the row order by sorting on the index.
cliwoc_data_indices = pd.concat(
    [cliwoc_data_indices_no_slaves, cliwoc_data_indices_slaves]
).sort_index()

In [8]:
# Label every slave voyages row with 2. The labels must mirror the row numbers
# these rows get in all_data, which is built with ignore_index=True: numbering
# continues right after the last CLIWOC row. Offsetting slave_voyage_logs.index
# by the last CLIWOC label only lines up if that index happens to start at 1;
# with a 0-based index the first slave row would reuse the last CLIWOC label.
slave_data_indices = pd.DataFrame(
    2,
    index=np.arange(len(slave_voyage_logs)) + len(cliwoc_data),
    columns=['data_indices'],
)

- cliwoc_data (no slaves) = 0
- cliwoc_data (slaves)    = 1
- slave_data              = 2

In [9]:
# Origin labels for all rows: CLIWOC labels (0/1) first, slave voyages (2) after.
indices = pd.concat(objs=[cliwoc_data_indices, slave_data_indices])

- Okay, now back to your code. The last lines of the next cell are commented out because the cells above already take care of indexing the data.

In [10]:
# Stack the CLIWOC logs on top of the slave voyage logs with a fresh row
# numbering, and pass the result through the project's clean_data helper.
all_data = clean_data(
    pd.concat(objs=[cliwoc_data, slave_voyage_logs], ignore_index=True)
)

# The origin of each row is tracked in `indices` (built in the cells above),
# so no separate list of slave training indices is computed here.

## further cleaning of data

- Nationality: 
    - british and great britain
    - french and france
    - spanish and spain
    - usa and american
    
- ShipName:
    - 9740 unique ship names; a good column to filter on!
    
- ShipType:
    - 118743 nan values for ship types (remove these columns?)
    - duplicate types in different languages, will this be a problem?
    
- VoyageFrom:
    - 1129 unique voyage starting points
    
- VoyageTo:
    - 1074 unique voyage ending points
    
- Year:
    - filter out the ships from the slave dataset that are before 1600?

- builds a mask that is True for rows where ShipType is not 'nan' (the rows to keep)

In [None]:
# True for rows whose ShipType is anything other than the string 'nan'.
ship_type_ind = all_data['ShipType'] != 'nan'

- builds a mask that is True for rows where ShipName is not an empty string (the rows to keep)

In [None]:
# True for rows whose ShipName is not the empty string.
empty_rows_ind = all_data['ShipName'] != ''

- keeps the rows whose ShipName occurs more often than some threshold count. A threshold of 0 is used so that the non-slave ship training data remains available...

In [None]:
# Count every ship name once, then mark the rows whose name occurs more than
# 0 times.
ship_name_counts = all_data['ShipName'].value_counts()
ship_name_ind = all_data['ShipName'].isin(
    ship_name_counts.index[ship_name_counts > 0]
)

In [None]:
# Combine the three row filters once and apply them by position, so all_data
# and indices always lose exactly the same rows even if their labels differ.
keep_rows = (empty_rows_ind & ship_name_ind & ship_type_ind).values

# Renumber BOTH frames afterwards. Previously only indices was renumbered,
# which left all_data with gaps in its index and the two frames no longer
# aligned by label.
all_data = all_data[keep_rows].reset_index(drop=True)
indices = indices[keep_rows].reset_index(drop=True)

In [None]:
# Re-apply the ship-name filter. The mask is rebuilt from the current all_data
# because an earlier filtering cell may already have dropped or renumbered
# rows, in which case the stored ship_name_ind no longer lines up with the
# frames it is applied to.
current_name_counts = all_data['ShipName'].value_counts()
ship_name_ind = all_data['ShipName'].isin(
    current_name_counts.index[current_name_counts > 0]
)

# Filter by position so both frames lose the same rows, then renumber both.
name_keep_rows = ship_name_ind.values
all_data = all_data[name_keep_rows].reset_index(drop=True)
indices = indices[name_keep_rows].reset_index(drop=True)

## Test of fuzzywuzzy method

In [None]:
 # Small example frame for trying out fuzzy string matching.
 df = pd.DataFrame({
     'id': [1, 2, 3, 4, 5, 6],
     'name': ['dog', 'cat', 'mad cat', 'good dog', 'bad dog', 'chicken'],
 })

In [None]:
def func(name):
    """Return the row positions in ``df`` whose 'name' fuzzily matches ``name``.

    A row matches when ``fuzz.partial_ratio`` of its 'name' value and ``name``
    is at least 85. Reads the notebook-level ``df``.

    Note: a later cell defines another ``func`` taking ``(name, column_name)``;
    once that cell has run, it replaces this definition.
    """
    def is_match(row):
        return fuzz.partial_ratio(row['name'], name) >= 85

    flags = df.apply(is_match, axis=1)
    positions = []
    for position, flag in enumerate(flags):
        if flag:
            positions.append(position)
    return positions

In [None]:
# For every row, list the positions of all rows with a similar 'name'.
df.apply(
    lambda record: func(record['name']),
    axis=1,
)

## Try fuzzywuzzy on subset of one of our columns

In [None]:
def func(name, column_name):
    """Return positions within the first ten rows of ``all_data`` that match.

    A row matches when ``fuzz.partial_ratio`` of its ``column_name`` value and
    ``name`` is at least 85. Reads the notebook-level ``all_data``.

    Note: this reuses the name of the one-argument ``func`` defined in an
    earlier cell and replaces it once this cell has run.
    """
    sample = all_data[0:10]

    def is_match(row):
        return fuzz.partial_ratio(row[column_name], name) >= 85

    flags = sample.apply(is_match, axis=1)
    positions = []
    for position, flag in enumerate(flags):
        if flag:
            positions.append(position)
    return positions

In [None]:
# Column whose first ten values are fuzzy-matched against each other.
column_name = 'Nationality'
all_data[0:10].apply(
    lambda record: func(record[column_name], column_name),
    axis=1,
)

# Encode data
- Must encode data before separating, otherwise values that do not occur in a subset will be encoded differently

In [11]:
# Encode the joined data set with the project helper. Per the note above, this
# has to happen before the data is split so every subset shares one encoding.
# NOTE(review): all_data is overwritten, so re-running this cell would pass
# the already encoded frame to encode_data_df again.
all_data = encode_data_df(all_data)

## Test case of combining/removing columns

In [None]:
# Toy frame with one numeric column and three identical text columns, used
# below to try out duplicated column names.
df = pd.DataFrame({
    'id2': [1, 2, 3, 4, 5, 6],
    'id1': ['dog', 'cat', 'mad cat', 'good dog', 'bad dog', 'chicken'],
    'id3': ['dog', 'cat', 'mad cat', 'good dog', 'bad dog', 'chicken'],
    'id': ['dog', 'cat', 'mad cat', 'good dog', 'bad dog', 'chicken'],
})

In [None]:
# Relabel the four columns by position; the first two now share the name 'id'.
df.columns = [
    'id',
    'id',
    'id2',
    'id3',
]

In [None]:
# Preview the first five rows of the relabelled frame.
df.head(n=5)

In [None]:
# Drop by label along the column axis; this removes every column named 'id'.
df = df.drop(labels=['id'], axis='columns')

In [None]:
# Display the frame after dropping the 'id' columns.
df

## Remove nan columns from encoding (currently there are no empty cells)

- if a cell were empty, all_data.loc[:,all_data.columns.get_loc('')]

In [12]:
# Select every column labelled 'nan' as a DataFrame (also when there is only
# one such column), then flag the rows where any of them is set.
# DataFrame.any(axis=1) does the row-wise reduction in one vectorised call
# instead of invoking a Python lambda once per row.
nan_flag_columns = all_data.loc[:, all_data.columns == 'nan']
all_data['no_data'] = nan_flag_columns.any(axis=1).astype(int)

In [13]:
# Remove every column labelled 'nan'; the 'no_data' column carries that
# information from here on.
all_data = all_data.drop(labels=['nan'], axis='columns')

In [14]:
# Preview the first five rows of the encoded data.
all_data.head(n=5)

Unnamed: 0,american,argentina,brazil,british,danish,denmark,dutch,france,french,genoa,...,york in virginia,ysla de santa cathalina,ysla de santa cathalinaysla de santa cathalina,zeeland,ziam,zierikzee,zuid afrika,zuiden,Year,no_data
0,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1800,0
1,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1790,0
2,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1790,0
3,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1790,0
4,0,0,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,1790,0


# Extract training data, and create list of classes

In [16]:
# slave_logs training data is from slave voyages data set (label 2 in indices)
training_slave_indices = (indices['data_indices'] == 2)
# Apply the mask by position: indices is labelled separately from all_data, so
# label-based alignment of a boolean Series can select the wrong rows.
is_training_slave_row = training_slave_indices.values
slave_logs = all_data[is_training_slave_row]

# NOTE(review): the recorded run of this cell stopped with KeyError: 'ShipName'.
# Presumably the lookup below is the source (cliwoc_data was reduced to
# desired_columns earlier) -- confirm against isolate_training_data.
criteria = {'ShipName': non_slave_ships}
no_slave_logs, no_slave_mask = isolate_training_data(cliwoc_data, criteria)

# remaining data is unclassified; convert it to a numpy array with .values,
# because DataFrame.as_matrix() is no longer available in current pandas
unclassified_logs = all_data[~is_training_slave_row].values

# clean-up
#del all_data

KeyError: 'ShipName'

In [None]:
# Display the non-slave training logs returned by isolate_training_data.
no_slave_logs

In [None]:
# Class label per training row (0 is for non-slave, 1 is for slave), in the
# same order as the rows of training_data below.
classes = np.concatenate([
    np.zeros(len(no_slave_logs)),
    np.ones(len(slave_logs)),
])

# joint training data: non-slave logs first, then slave logs
training_data = pd.concat([no_slave_logs, slave_logs], ignore_index=True)

# convert to numpy array; .values is used because DataFrame.as_matrix() is no
# longer available in current pandas
training_data = training_data.values

# cleanup
del no_slave_logs, slave_logs

# Fit training data to classifier
- **note!** first column of numpy array is index! do not include in classification!

In [None]:
# Multinomial naive Bayes with the smoothing and prior settings spelled out.
classifier = MultinomialNB(
    alpha=1.0,
    class_prior=None,
    fit_prior=True,
)
classifier.fit(training_data, classes)

# Predict a class for every unclassified log.
predictions = classifier.predict(unclassified_logs)

In [None]:
# Tally how many logs were assigned to each predicted class.
predictions_count = collections.Counter()
predictions_count.update(predictions)
print(predictions_count)

# Test classifier
- check if slave logs from cliwoc data were classified correctly (want mostly classified as 1)
- compare first column with slave_index
