In [1]:
import numpy as np
import pandas as pd
import sklearn
# NOTE: sklearn.cross_validation was deprecated in scikit-learn 0.18 and removed
# in 0.20; sklearn.model_selection is its replacement.
from sklearn import cross_validation
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB, GaussianNB
from sklearn.neighbors import KNeighborsClassifier

In [2]:
# Training data: one single-purpose CSV per field type, read in a fixed order
# (addresses, zips, listing-agent emails).
df_addresses, df_zips, df_listing_agent_emails = [
    pd.read_csv(csv_path)
    for csv_path in (
        './data/train/bayeast_addresses.csv',
        './data/train/bayeast_zips.csv',
        './data/train/bayeast_listing_agent_emails.csv',
    )
]

In [3]:
# Classify the training data: tag every row of each frame with the integer id
# of the field type that frame holds.
#   1 -> rs_loc_full_address, address
#   2 -> rs_loc_postal_code, zipcode
#   3 -> rs_listing_agent_email, email
for labelled_frame, class_id in (
    (df_addresses, 1),
    (df_zips, 2),
    (df_listing_agent_emails, 3),
):
    labelled_frame['class'] = class_id

In [4]:
# Merge the datasets together
# pd.concat stacks the three labelled frames in a single call. DataFrame.append
# was deprecated in pandas 1.4 and removed in 2.0, and chaining it copied the
# accumulated rows once per call. Row indices are left as-is (not reset), which
# matches what the chained appends produced.
df = pd.concat([df_addresses, df_zips, df_listing_agent_emails])

In [5]:
# Data cleansing: discard rows whose value is the literal two-character string
# backslash-N (presumably the export's NULL placeholder -- TODO confirm).
is_null_marker = df['value'] == '\\N'
df = df[~is_null_marker]

In [6]:
# Tokenize the `value` column: learn the token vocabulary and encode every row
# as a vector of token counts in one pass. `count_vect` stays fitted so later
# cells can transform new values with the same vocabulary.
count_vect = CountVectorizer()
X = count_vect.fit_transform(df['value'])

# Target labels: the integer class id assigned to each row.
y = df['class']

In [7]:
# Train the model
# GaussianNB is used here; MultinomialNB was the alternative tried earlier.
clf = GaussianNB()

# The classifier is fed a dense array, so densify the count matrix. The guard
# makes this cell safe to re-run: once X is already an ndarray it no longer has
# a .toarray() method and an unconditional call would raise AttributeError.
if hasattr(X, 'toarray'):
    X = X.toarray()

# Hold out 20% of the rows for evaluation. A fixed random_state makes the split
# (and therefore the printed score) reproducible across kernel restarts.
# train_test_split comes from sklearn.model_selection; the old
# sklearn.cross_validation module was removed in scikit-learn 0.20.
X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=42
)

clf.fit(X_train, y_train)

# `confidence` is the mean accuracy on the held-out 20%.
confidence = clf.score(X_test, y_test)
print(confidence)

0.982437352752


In [8]:
# Self check: fraction of ALL cleaned rows (including the ones the model was
# fitted on) whose predicted class equals the assigned label.
predictions = clf.predict(X)
is_correct = predictions == df['class']
print(sum(is_correct) / len(df.index))

0.996358495416


In [9]:
# Import test data: same three field types as the training set, read from the
# mlslistings files in a fixed order (addresses, zips, listing-agent emails).
df_test_addresses, df_test_zips, df_test_listing_agent_emails = [
    pd.read_csv(csv_path)
    for csv_path in (
        './data/test/mlslistings_addresses.csv',
        './data/test/mlslistings_zips.csv',
        './data/test/mlslistings_listing_agent_emails.csv',
    )
]

In [10]:
# Manually classify: give each test frame the same integer class id that its
# field type received in the training data (1 = address, 2 = zip, 3 = email).
for labelled_frame, class_id in (
    (df_test_addresses, 1),
    (df_test_zips, 2),
    (df_test_listing_agent_emails, 3),
):
    labelled_frame['class'] = class_id

In [11]:
# Combine the test data
# pd.concat stacks the three labelled test frames in a single call.
# DataFrame.append was deprecated in pandas 1.4 and removed in 2.0. Row indices
# are left as-is (not reset), matching what the chained appends produced.
df_test = pd.concat([df_test_addresses, df_test_zips, df_test_listing_agent_emails])

In [12]:
# Data cleansing: discard test rows whose value is the literal two-character
# string backslash-N (presumably the export's NULL placeholder -- TODO confirm).
is_null_marker = df_test['value'] == '\\N'
df_test = df_test[~is_null_marker]

In [13]:
# Score the trained classifier on the separate test source. The values are
# encoded with the vocabulary already fitted on the training data, then
# densified before being passed to the classifier.
test_counts = count_vect.transform(df_test['value']).toarray()
predictions = clf.predict(test_counts)

# Fraction of test rows whose predicted class equals the manual label.
is_correct = predictions == df_test['class']
print(sum(is_correct) / len(df_test.index))

0.983539862646


In [14]:
# Manual testing: classify a single hand-typed value. Swap in another string
# (for example a zip code such as '48331') to probe the other classes.
sample_values = ['thenastyone@gmail.com']
sample_counts = count_vect.transform(sample_values).toarray()
clf.predict(sample_counts)

array([3])