In [1]:
%matplotlib inline
import numpy as np
import re
from ggplot import *
import pandas as pd
from langdetect import detect
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import r2_score, mean_squared_error
from nltk.stem.porter import PorterStemmer
from nltk.corpus import stopwords
import nltk
import matplotlib.pyplot as plt
from matplotlib import cm
import seaborn as sns
from sklearn.ensemble import RandomForestRegressor


You can access Timestamp as pandas.Timestamp
  pd.tslib.Timestamp,
  from pandas.lib import Timestamp
  from pandas.core import datetools


In [2]:
def get_unique(x):    
    return type(x.values[1]),x.unique()

def rstr(df):
    return df.apply(get_unique)

In [3]:
df_listing = pd.read_csv('data/listing_cleansed.csv')

cols_text =[col for col in df_listing.columns if 'text' in col]
cols_of_interest = ['id','review_scores_rating','availability_90']+cols_text
df_listing = df_listing[cols_of_interest]

In [4]:
df_missing_data = pd.DataFrame(df_listing.isnull().sum()/len(df_listing)*100).sort_values(by=0,ascending=False)
df_missing_data.columns = ['missing_percent']
df_missing_data[df_missing_data.missing_percent>0]

Unnamed: 0,missing_percent
text_notes,68.798871
text_access,54.919635
text_interaction,54.887055
text_neighborhood_overview,48.859687
text_host_about,46.43788
text_house_rules,40.20417
text_transit,39.400521
text_space,30.701564
review_scores_rating,17.463076
text_summary,5.668983


In [5]:
df_listing = df_listing[(~df_listing.text_name.isnull())& (~df_listing.review_scores_rating.isnull())]
df_listing[cols_text] = df_listing[cols_text].astype(str)
df_listing['text_all'] =df_listing[cols_text].apply(lambda x: ' '.join(x), axis=1)
df_listing['text_all'] =df_listing.text_all.str.replace(r'[^a-zA-Z\d\s:]','')

In [6]:
df_listing = df_listing[df_listing.text_all.str.len()>3]


In [7]:
stringo = df_listing.text_all.tail(-1).values
print stringo

[ 'Bed and Garden nan HistoricalVilla romant Home garden with barbecue children welcome four persons  In our halftimbered villa in PankowNiederschnhausen we offer in historic ambiance a tasteful and stylish apartment for your stay in Berlin On two floors you can live sleep and cook The apartment has its own garden entrance the garden is available for our guests Barbecue and seating available Families with children are welcome there is a playground in the garden  Holiday homes equipment is unique and high quality:  The interior is in color and style of furniture lovingly adapted to the building Underfloor heating and WLan make your stay more comfortable HistoricalVilla romant Home garden with barbecue children welcome four persons  In our halftimbered villa in PankowNiederschnhausen we offer in historic ambiance a tasteful and stylish apartment for your stay in Berlin On two floors you can live sleep and cook The apartment has its own garden entrance the garden is available for our gues

In [8]:
porter = PorterStemmer()

def tokenizer_porter(text):
    return [porter.stem(word) for word in text.split()]

nltk.download('stopwords')

stop = stopwords.words('english')

raw_text = df_listing.text_all.values

vectorizer = TfidfVectorizer(ngram_range=(1,1),tokenizer=tokenizer_porter,stop_words=stop,max_features=20000)
X_vectorized = vectorizer.fit_transform(raw_text)

[nltk_data] Downloading package stopwords to /home/ramon/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!


In [9]:
freqs = [(word, X_vectorized.getcol(idx).sum()) for word, idx in vectorizer.vocabulary_.items()]
#sort from largest to smallest
pd.DataFrame(sorted (freqs, key = lambda x: -x[1])).head()

Unnamed: 0,0,1
0,,492.168884
1,apart,416.013535
2,und,409.968159
3,berlin,329.086283
4,room,264.318124


In [10]:
idf = vectorizer.idf_
df_tfidf = pd.DataFrame.from_dict(dict(zip(vectorizer.get_feature_names(), idf)),orient='index')
df_tfidf.columns=['TFIDF']
df_tfidf.sort_values(by='TFIDF',ascending=False)

Unnamed: 0,TFIDF
restaurantsclub,9.238537
botschaft,9.238537
yrckstrass,9.238537
scheunenviertelspandau,9.238537
mbitss,9.238537
u8m10s1225,9.238537
rosenthalor,9.238537
brotsorten,9.238537
rosenthalerplatzcent,9.238537
restand,9.238537


In [11]:
X_vectorized.shape

(7567, 20000)

In [12]:
X_train,X_test,y_train,y_test = train_test_split(X_vectorized,df_listing.availability_90.values,test_size=.2)


In [13]:
lm = LinearRegression()
lm.fit(X_train,y_train)
prediction = lm.predict(X_test)
r2_score(y_test,prediction)

-3.1148104409921302

In [14]:
rf = RandomForestRegressor()
rf.fit(X_train,y_train)
prediction = rf.predict(X_test)
r2_score(y_test,prediction)

-0.029003008559893129