In [74]:
import generate_mnb_models as modeling

In [75]:
import pickle
import pandas as pd
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
from sklearn.cross_validation import train_test_split
from airbnb.airbnbneighborhood import AirBnBNeighborhood
from airbnb.airbnblisting import AirBnBListing

In [76]:
# Hyper-parameter grid for the TF-IDF + Multinomial Naive Bayes search.
# run_grid_search() evaluates every combination of the three vectorizer
# lists once per seed in RANDOM_STATE_LIST.
MAX_DF_LIST = [0.5, 0.66, 0.75, 0.83, 0.9, 1.0]                 # TfidfVectorizer max_df candidates
MAX_FEATURE_LIST = [500, 1000, 2000, 3000, 4000, 5000, 8000]   # TfidfVectorizer max_features candidates
MIN_DF_LIST = [1, 2]                                            # TfidfVectorizer min_df candidates
RANDOM_STATE_LIST = [1, 42, 1337]                               # seeds passed to train_test_split
# Names of the label columns in feature_df; one model is trained per trait.
TRAIT_LIST = ['artsy', 'shopping', 'dining', 'nightlife']

In [77]:
# Load the listings table through the project's generate_mnb_models helper.
# NOTE(review): the data source behind load_data() is not visible in this
# notebook -- confirm where it reads from before re-running.
df = modeling.load_data()

In [78]:
df.head()

Unnamed: 0,_id,address,description_clean,description_raw,headline,listing_name,neighborhood,num_saved,price,price_currency,city,traits
0,6751766,"Bush St, San Francisco",i am gone m f am to pm and potentially for the...,\nInteraction with Guests\nI'm gone M-F 6AM to...,"Apartment in San Francisco, United States. Pri...",Lower Nob Hill Private Room,Downtown,Saved 72 times,115,USD,san-francisco,"[Shopping, Great Transit, Touristy, Nightlife,..."
1,1281167,"Post Street, San Francisco (Union Square)",availability can be very volatile i try to kee...,\nThe Space\nAvailability can be very volatile...,"Apartment in San Francisco, United States. The...",Cool Timeshare/Hotel (Union Sq.),Downtown,Saved 248 times,175,USD,san-francisco,"[Shopping, Great Transit, Touristy, Nightlife,..."
2,2478558,"Post Street, San Francisco (Union Square)",as an owner i will transfer my reservation int...,"\nThe Space\nAs an Owner, I will transfer my r...","Apartment in San Francisco, United States. The...","Donatello hotel room, Union Square",Downtown,Saved 8 times,155,USD,san-francisco,"[Shopping, Great Transit, Touristy, Nightlife,..."
3,2227475,"Hyde Street, San Francisco (Downtown)",this building is a a registered historic landm...,\nThe Space\nThis building is a a registered h...,"Apartment in San Francisco, United States. Cla...",Classic San Francisco,Downtown,Saved 470 times,199,USD,san-francisco,"[Shopping, Great Transit, Touristy, Nightlife,..."
4,546633,"Hyde St, San Francisco (Downtown)",cheerful and bright studio in edwardian buildi...,\nThe Space\nCheerful and bright studio in Edw...,"Apartment in San Francisco, United States. Che...",Sunny Cheery Lower Nob Hill Studio,Downtown,Saved 465 times,115,USD,san-francisco,"[Shopping, Great Transit, Touristy, Nightlife,..."


In [79]:
# Build the frame used for modelling. Judging by the output of the next cell,
# add_features() appends boolean 'artsy', 'shopping', 'dining' and 'nightlife'
# columns -- presumably derived from the 'traits' column; confirm in
# generate_mnb_models.
feature_df = modeling.add_features(df)

In [80]:
feature_df.head()

Unnamed: 0,_id,address,description_clean,description_raw,headline,listing_name,neighborhood,num_saved,price,price_currency,city,traits,artsy,shopping,dining,nightlife
0,6751766,"Bush St, San Francisco",i am gone m f am to pm and potentially for the...,\nInteraction with Guests\nI'm gone M-F 6AM to...,"Apartment in San Francisco, United States. Pri...",Lower Nob Hill Private Room,Downtown,Saved 72 times,115,USD,san-francisco,"[Shopping, Great Transit, Touristy, Nightlife,...",False,True,True,True
1,1281167,"Post Street, San Francisco (Union Square)",availability can be very volatile i try to kee...,\nThe Space\nAvailability can be very volatile...,"Apartment in San Francisco, United States. The...",Cool Timeshare/Hotel (Union Sq.),Downtown,Saved 248 times,175,USD,san-francisco,"[Shopping, Great Transit, Touristy, Nightlife,...",False,True,True,True
2,2478558,"Post Street, San Francisco (Union Square)",as an owner i will transfer my reservation int...,"\nThe Space\nAs an Owner, I will transfer my r...","Apartment in San Francisco, United States. The...","Donatello hotel room, Union Square",Downtown,Saved 8 times,155,USD,san-francisco,"[Shopping, Great Transit, Touristy, Nightlife,...",False,True,True,True
3,2227475,"Hyde Street, San Francisco (Downtown)",this building is a a registered historic landm...,\nThe Space\nThis building is a a registered h...,"Apartment in San Francisco, United States. Cla...",Classic San Francisco,Downtown,Saved 470 times,199,USD,san-francisco,"[Shopping, Great Transit, Touristy, Nightlife,...",False,True,True,True
4,546633,"Hyde St, San Francisco (Downtown)",cheerful and bright studio in edwardian buildi...,\nThe Space\nCheerful and bright studio in Edw...,"Apartment in San Francisco, United States. Che...",Sunny Cheery Lower Nob Hill Studio,Downtown,Saved 465 times,115,USD,san-francisco,"[Shopping, Great Transit, Touristy, Nightlife,...",False,True,True,True


In [81]:
# Documents fed to the vectorizer: the 'description_clean' text of each listing,
# as a plain Python list. tiebreaker() reads this name as a notebook-level global.
X_doc = list(feature_df['description_clean'])
X_doc[:5]  # display the first five documents as a sanity check

[u'i am gone m f am to pm and potentially for the evening depending on events for the day happy to give advice or hang out in the evening blocks from a cable car pickup bus stops are near and the bart is miles most places are walkable have never waited more than two minutes for uber or lyft',
 u'availability can be very volatile i try to keep the calendar updated as best as i can only two weeks or less in advance if you see availability please submit a reservation request quickly just show up and check in like any other hotel no arranging to get keys necessary description discover this modern hotel the donatello inspired by the renowned renaissance painter who shares its name located in the heart of san francisco s fashionable shopping and theater district near numerous attractions the union square sf hotel boasts a charming italian atmosphere superior amenities and attentive staff with square feet of space and foot ceilings these rooms are some of the largest in the san francisco area

In [82]:
# A single estimator with default settings, shared by every function below.
# Each of them calls mnb.fit() again, so the object always reflects the most
# recent fit only.
mnb = MultinomialNB()

In [83]:
def run_mnb(X_doc, y, mnb, max_df, max_features, min_df, random_state):
    """Fit and score `mnb` on one train/test split for one TF-IDF setting.

    Parameters
    ----------
    X_doc : sequence of documents handed to train_test_split / TfidfVectorizer.
    y : labels aligned with X_doc.
    mnb : estimator exposing fit() and score(); it is refit in place.
    max_df, max_features, min_df : forwarded unchanged to TfidfVectorizer.
    random_state : seed forwarded to train_test_split.

    Returns
    -------
    tuple
        (max_df, max_features, min_df, random_state, train_score, test_score)
        where the two scores are the values returned by mnb.score().
    """
    docs_train, docs_test, labels_train, labels_test = train_test_split(
        X_doc, y, random_state=random_state)

    # The vectorizer is fitted on the training documents only; the held-out
    # documents are merely transformed with that fitted vocabulary.
    vectorizer = TfidfVectorizer(max_df=max_df, max_features=max_features, min_df=min_df)
    train_matrix = vectorizer.fit_transform(docs_train).toarray()
    mnb.fit(train_matrix, labels_train)
    test_matrix = vectorizer.transform(docs_test).toarray()

    scores = (mnb.score(train_matrix, labels_train),
              mnb.score(test_matrix, labels_test))
    return (max_df, max_features, min_df, random_state) + scores

In [84]:
def run_grid_search(X_doc, y, mnb, trait):
    """Evaluate every grid combination and return the best-scoring setting(s).

    Each (max_df, max_features, min_df) combination from the module-level
    grid lists is scored with run_mnb() once per seed in RANDOM_STATE_LIST.
    The raw per-run results are written to '../models/<trait>_results.csv'.
    Scores are then averaged over the seeds, and the row(s) with the highest
    mean test score are returned.

    Parameters
    ----------
    X_doc : sequence of documents.
    y : labels aligned with X_doc.
    mnb : estimator forwarded to run_mnb(); it is refit on every run.
    trait : str, used only to name the results CSV.

    Returns
    -------
    pandas.DataFrame
        One row per winning setting (several rows on a tie), re-indexed from
        0, with columns 'index', 'max_df', 'max_features', 'min_df',
        'random_state', 'train_score' and 'test_score'. After averaging,
        'random_state' holds the mean of the seeds and carries no meaning; it
        is kept only so the column layout stays the same for callers.
    """
    import itertools  # local import: only this function needs it

    grid = itertools.product(MAX_DF_LIST, MAX_FEATURE_LIST, MIN_DF_LIST, RANDOM_STATE_LIST)

    results = []
    for max_df, max_features, min_df, random_state in grid:
        result = run_mnb(X_doc, y, mnb, max_df, max_features, min_df, random_state)
        # Single-argument call form prints the same text on Python 2 and 3.
        print(result)
        results.append(result)

    columns = ['max_df', 'max_features', 'min_df', 'random_state', 'train_score', 'test_score']
    results_df = pd.DataFrame(results, columns=columns)
    results_df.to_csv('../models/%s_results.csv' % trait)

    # Average over the seeds. 'random_state' must NOT be a grouping key:
    # with it, every group is a single run and .mean() averages nothing, so
    # the "best" setting would be picked from one lucky split.
    mean_df = results_df.groupby(by=['max_df', 'max_features', 'min_df'], as_index=False).mean()

    best_test_score = mean_df['test_score'].max()
    winners_df = mean_df[mean_df['test_score'] == best_test_score]

    return winners_df.reset_index()

In [85]:
def tiebreaker(mnb, max_test_results_df):
    """Order tied grid-search winners by their score on the full corpus.

    For every candidate row a TfidfVectorizer is fitted on all documents,
    `mnb` is refit on the resulting matrix, and mnb.score() on that same data
    is stored in a 'full_score' column. Rows are returned best-first, so row
    0 is the candidate the caller should use.

    NOTE: this function reads the notebook-level globals `X_doc` and `y`;
    they are not parameters. Make sure `y` refers to the trait currently
    being modelled before calling.

    Parameters
    ----------
    mnb : estimator exposing fit() and score(); it is refit in place.
    max_test_results_df : pandas.DataFrame with at least the columns
        'max_df', 'max_features' and 'min_df'. It is not modified.

    Returns
    -------
    pandas.DataFrame
        Copy of the input with an added 'full_score' column, sorted by that
        column in descending order and re-indexed from 0.
    """
    tiebreaker_df = max_test_results_df.copy()
    tiebreaker_df['full_score'] = 0.0

    for i in tiebreaker_df.index:
        tfidf = TfidfVectorizer(max_df=tiebreaker_df['max_df'][i],
                                max_features=tiebreaker_df['max_features'][i],
                                min_df=tiebreaker_df['min_df'][i])
        X = tfidf.fit_transform(X_doc).toarray()

        mnb.fit(X, y)
        # Write the score for THIS row only; assigning to the whole column
        # would overwrite every candidate with the last candidate's score.
        tiebreaker_df.loc[i, 'full_score'] = mnb.score(X, y)

    # Best score first, because the caller picks row 0. A stable argsort on
    # the negated scores keeps the original order among equal scores, and
    # avoids DataFrame.sort / sort_values, which are not both available in
    # every pandas version.
    order = (-tiebreaker_df['full_score'].values).argsort(kind='mergesort')
    tiebreaker_df = tiebreaker_df.iloc[order]

    return tiebreaker_df.reset_index()

In [86]:
def run_winning_model(X_doc, y, max_df, max_features, min_df, mnb, trait):
    """Fit the chosen configuration on all documents and pickle the result.

    Writes two files:
      '../models/tfidf_<trait>.pkl'      -- the fitted TfidfVectorizer
      '../models/mnb_<trait>_final.pkl'  -- `mnb` after fitting on the full data

    Parameters
    ----------
    X_doc : sequence of documents; all of them are used for fitting.
    y : labels aligned with X_doc.
    max_df, max_features, min_df : forwarded unchanged to TfidfVectorizer.
    mnb : estimator exposing fit(); it is refit in place and then pickled.
    trait : str, used in the printed message and in both file names.

    Returns
    -------
    None
    """
    # Single-argument call form prints the same text on Python 2 and 3.
    print("")
    print("THE WINNING MODEL IS FOR %s IS: %s, %s, %s" % (trait, max_df, max_features, min_df))

    tfidf = TfidfVectorizer(max_df=max_df, max_features=max_features, min_df=min_df)
    vectorized_corpus = tfidf.fit_transform(X_doc)

    # Pickle streams must be written in binary mode, and `with` closes the
    # handle even if pickle.dump() raises.
    with open('../models/tfidf_%s.pkl' % trait, 'wb') as tfidf_pickle_file:
        pickle.dump(tfidf, tfidf_pickle_file)

    X = vectorized_corpus.toarray()

    mnb.fit(X, y)
    with open('../models/mnb_%s_final.pkl' % trait, 'wb') as mnb_pickle_file:
        pickle.dump(mnb, mnb_pickle_file)

In [87]:
# Select, fit and persist one model per trait.
for trait in TRAIT_LIST:
    # `y` has to remain a notebook-level name: tiebreaker() reads it as a global.
    y = feature_df[trait]

    max_test_results_df = run_grid_search(X_doc, y, mnb, trait)
    # More than one row means several settings share the top test score.
    if len(max_test_results_df) >= 2:
        max_test_results_df = tiebreaker(mnb, max_test_results_df)

    # Row 0 is the winner. Values are read column-first so each one keeps
    # its own column's dtype.
    max_df, max_features, min_df = (
        max_test_results_df[column][0]
        for column in ('max_df', 'max_features', 'min_df')
    )

    run_winning_model(X_doc, y, max_df, max_features, min_df, mnb, trait)

(0.5, 500, 1, 1, 0.73151260504201676, 0.71788413098236781)
(0.5, 500, 1, 42, 0.73529411764705888, 0.7229219143576826)
(0.5, 500, 1, 1337, 0.73529411764705888, 0.66246851385390426)
(0.5, 500, 2, 1, 0.73151260504201676, 0.71788413098236781)
(0.5, 500, 2, 42, 0.73529411764705888, 0.7229219143576826)
(0.5, 500, 2, 1337, 0.73529411764705888, 0.66246851385390426)
(0.5, 1000, 1, 1, 0.79747899159663871, 0.76322418136020154)
(0.5, 1000, 1, 42, 0.7978991596638656, 0.76196473551637278)
(0.5, 1000, 1, 1337, 0.79621848739495793, 0.70906801007556675)
(0.5, 1000, 2, 1, 0.79747899159663871, 0.76448362720403018)
(0.5, 1000, 2, 42, 0.79915966386554627, 0.76070528967254403)
(0.5, 1000, 2, 1337, 0.79411764705882348, 0.70906801007556675)
(0.5, 2000, 1, 1, 0.82436974789915962, 0.76700251889168769)
(0.5, 2000, 1, 42, 0.81932773109243695, 0.77959697732997479)
(0.5, 2000, 1, 1337, 0.82689075630252096, 0.71158690176322414)
(0.5, 2000, 2, 1, 0.82478991596638651, 0.76952141057934509)
(0.5, 2000, 2, 42, 0.82058823