In [None]:
%matplotlib inline
import numpy as np
from ggplot import *
import pandas as pd
from langdetect import detect
from sklearn.feature_extraction.text import CountVectorizer, TfidfTransformer,TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.decomposition import PCA

from sklearn.cluster import KMeans
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import RandomForestRegressor
from sklearn.metrics import r2_score, mean_squared_error
from nltk.stem.porter import PorterStemmer

import nltk
import matplotlib.pyplot as plt
from matplotlib import cm
import seaborn as sns

In [None]:
def detect_lang(comment):
    """Return the language code detected for ``comment``.

    Detection is best-effort: if ``detect`` raises for any reason, a notice
    is printed and the sentinel string ``"No found"`` is returned instead.

    Parameters
    ----------
    comment : text passed straight to ``detect``.

    Returns
    -------
    The value returned by ``detect``, or ``"No found"`` on failure.
    """
    try:
        return detect(comment)
    except Exception:
        # `Exception` rather than a bare `except:` so that Ctrl-C
        # (KeyboardInterrupt) and SystemExit still interrupt a long `apply`.
        print("could not find language")
        return "No found"

In [None]:
def _to_unicode(value):
    """Return ``value`` as unicode text on both Python 2 and Python 3."""
    text_type = type(u'')
    if isinstance(value, text_type):
        return value
    if isinstance(value, bytes):
        return value.decode('utf-8')
    # Non-string cells (e.g. NaN for a missing comment) become their text form.
    return text_type(value)


def add_language_to_reviews(df):
    """Detect the language of every review and write the result to disk.

    Reads ``data/reviews.csv``, adds a ``comments2`` column holding the
    comment as unicode text and a ``language`` column produced by
    ``detect_lang``, then writes ``data/reviews_with_lang.csv``.

    Parameters
    ----------
    df : unused. The reviews are always re-read from ``data/reviews.csv``;
        the parameter is kept so existing calls keep working.

    Returns
    -------
    None
    """
    df_reviews = pd.read_csv('data/reviews.csv')

    # `str(x).decode('utf-8')` only works for Python 2 byte strings; the helper
    # handles bytes, unicode and non-string cells on either interpreter.
    df_reviews["comments2"] = df_reviews.comments.apply(_to_unicode)
    df_reviews['language'] = df_reviews.comments2.apply(detect_lang)
    df_reviews.to_csv("data/reviews_with_lang.csv")
    

In [None]:
# Load the language-tagged reviews (the file written by add_language_to_reviews)
# and the listings table.
df_reviews = pd.read_csv('data/reviews_with_lang.csv')
df_listing = pd.read_csv('data/listings.csv')

# Keep only the reviews whose detected language is English.
df_reviews_eng = df_reviews.loc[df_reviews['language'] == 'en']


In [None]:
# Display the column labels of the listings frame.
df_listing.columns

In [None]:
# Scatter of 30-day availability against review count, whole-home listings only.
df_listing.loc[df_listing['room_type'] == 'Entire home/apt'].plot.scatter(
    x='availability_30',
    y='number_of_reviews',
)

In [None]:
# Distinct values of the room_type column.
df_listing['room_type'].unique()

In [None]:
# Attach listing-level attributes to every English review (inner join on the listing id).
# NOTE(review): this rebinds df_reviews_eng, so running the cell twice merges twice.
df_reviews_eng = df_reviews_eng.merge(
    df_listing[[
        'id',
        'review_scores_rating',
        'review_scores_cleanliness',
        'room_type',
        'availability_30',
        'availability_90',
        'reviews_per_month',
    ]],
    left_on='listing_id',
    right_on='id',
)

In [None]:
# Preview the first rows of the merged review/listing frame.
df_reviews_eng.head()

In [None]:
# Number of non-null comments per detected language, most frequent first.
groupy_per_lang = (
    df_reviews
    .groupby('language')['comments']
    .count()
    .sort_values(ascending=False)
)

In [None]:
# Bar chart of the per-language review counts.
groupy_per_lang.plot(kind='bar', title="Per review language")

In [None]:
def concat_comments(x):
    """Join an iterable of comment strings into one string separated by ``'- '``."""
    separator = '- '
    return separator.join(x)

# Keep whole-home listings that have a rating, and drop the automatic
# cancellation notices.
df_reviews_eng = df_reviews_eng[df_reviews_eng.room_type=='Entire home/apt']
df_reviews_eng = df_reviews_eng[~df_reviews_eng.review_scores_rating.isnull()]
df_reviews_eng = df_reviews_eng[~df_reviews_eng.comments.str.contains('The host canceled this reservation')]

# One row per listing: concatenated comment text, comment count, and the mean
# of each listing-level attribute.
groupy = df_reviews_eng.groupby('listing_id').agg({'comments': [concat_comments,'count'], 'review_scores_rating': 'mean', 'availability_30':'mean','availability_90':'mean','reviews_per_month':'mean'})

# The aggregation yields two-level columns, so `groupy.comments` would select
# BOTH the text and the count column (a 2-D array). The vectorizers need a
# 1-D array of documents, so pick the concatenated-text column explicitly.
raw_text = groupy[('comments', 'concat_comments')].values

# Flatten ('comments', 'count') -> 'comments_count', etc.
groupy.columns = ["_".join(x) for x in groupy.columns.values]

groupy.head()

In [None]:
# Per-listing scatter of mean 90-day availability against mean reviews per month.
(
    ggplot(groupy, aes(x='availability_90_mean', y='reviews_per_month_mean'))
    + geom_point()
)

In [None]:
# Shared stemmer instance used by the tokenizer below.
porter = PorterStemmer()

def tokenizer_porter(text):
    """Split ``text`` on whitespace and return the list of Porter-stemmed tokens."""
    return list(map(porter.stem, text.split()))

# Download the NLTK stop-word corpus, then load the English list into `stop`,
# which the vectorizers below receive as their `stop_words` argument.
nltk.download('stopwords')
from nltk.corpus import stopwords
stop = stopwords.words('english')


def get_vectorized_text(X,ngram_range=(1,1),max_features=None):
    """Vectorize raw documents into a dense TF-IDF matrix.

    Parameters
    ----------
    X : iterable of str
        Raw documents to vectorize.
    ngram_range : tuple (min_n, max_n)
        Passed to the count vectorizer.
    max_features : int or None
        Passed to the count vectorizer.

    Returns
    -------
    Dense matrix (the result of ``.todense()`` on the pipeline output).
    """
    # Raw counts first, then ONE TF-IDF weighting step. Using TfidfVectorizer
    # as the first step would apply IDF weighting and normalisation twice,
    # because TfidfVectorizer already includes a TfidfTransformer.
    pipeline = Pipeline([
        ('vect', CountVectorizer(ngram_range=ngram_range,
                                 tokenizer=tokenizer_porter,
                                 stop_words=stop,
                                 max_features=max_features)),
        ('tfidf', TfidfTransformer()),
    ])
    return pipeline.fit_transform(X).todense()


# Bigram TF-IDF over the per-listing concatenated comments (at most 3000 features).
vectorizer = TfidfVectorizer(ngram_range=(2,2),tokenizer=tokenizer_porter,stop_words=stop,max_features=3000)
# toarray() gives a plain ndarray; the np.matrix returned by todense() is
# rejected by recent scikit-learn estimators such as PCA.
X_vectorized = vectorizer.fit_transform(raw_text).toarray()

In [None]:
# Fit a 20-component PCA on the bigram TF-IDF matrix. The seed keeps the
# result reproducible if the randomized SVD solver is selected.
pca = PCA(n_components=20, random_state=0).fit(X_vectorized)

# Total and per-component (in percent) explained variance.
# print(...) with a single argument behaves the same on Python 2 and 3.
print(pca.explained_variance_ratio_.sum())
print(pca.explained_variance_ratio_*100)

In [None]:
# Project every listing onto the fitted components and store the first two
# as columns on the per-listing frame.
data2D = pca.transform(X_vectorized)
groupy['pc1'], groupy['pc2'] = data2D[:, 0], data2D[:, 1]

In [None]:
# Colour the pc1/pc2 scatter by the listing's mean rating. After the column
# flattening the rating column is named 'review_scores_rating_mean'; the
# unsuffixed name no longer exists on `groupy`.
cmap = cm.get_cmap('Spectral')
# pc1/pc2 are already stored on `groupy`, so the projection is not recomputed here.
groupy.plot(kind='scatter',x='pc1', y='pc2',c='review_scores_rating_mean',colormap=cmap,alpha=.2)

In [None]:
# Distribution of the rating attached to each English review, in 100 bins.
df_reviews_eng['review_scores_rating'].hist(bins=100)

In [None]:
# `groupy` columns were flattened to '<column>_<aggregation>', so the rating
# and the concatenated text live under the suffixed names used below.
groupy['score_cat']=groupy.review_scores_rating_mean.astype(str)
# 1.0 when any of the listing's comments contains the substring 'dirty', else 0.0.
groupy['dirty']= groupy.comments_concat_comments.str.contains('dirty')*1.0

In [None]:
# pc1/pc2 scatter coloured by the 'dirty' flag.
# NOTE(review): the title reads "Review Score" while the colour encodes 'dirty' — confirm which is intended.
(
    ggplot(groupy, aes(x='pc1', y='pc2', color='dirty'))
    + geom_point()
    + xlab("pc1")
    + ylab("pc2")
    + ggtitle("Review Score")
)

In [None]:
# Bigram TF-IDF again, this time keeping up to 20000 features.
# Despite the names, `count_vectorizer` / `counts` hold TF-IDF weights, not raw counts.
count_vectorizer = TfidfVectorizer(
    ngram_range=(2, 2),
    tokenizer=tokenizer_porter,
    stop_words=stop,
    max_features=20000,
)
counts = count_vectorizer.fit_transform(raw_text)

In [None]:
# Sanity check: documents, TF-IDF matrix and per-listing frame should share a row count.
# print(...) with a single argument behaves the same on Python 2 and 3.
print(raw_text.shape)
print(counts.shape)
print(groupy.shape)

### Add PCA 

In [None]:
# Densify once and use the SAME matrix for fit and transform. Transforming
# `X_vectorized` (3000 features) with a PCA fitted on `counts`
# (20000 features) fails with a feature-count mismatch.
counts_dense = counts.toarray()
pca = PCA(n_components=200, random_state=0).fit(counts_dense)

# Total and per-component (in percent) explained variance.
print(pca.explained_variance_ratio_.sum())
print(pca.explained_variance_ratio_*100)
PC_matrix = pca.transform(counts_dense)

In [None]:
# Display the type of the projected matrix.
type(PC_matrix)

In [None]:
# Name the components pc1..pcN, deriving N from the projected matrix so the
# labels stay in sync if the number of PCA components changes.
pc_columns = ["pc"+str(i) for i in range(1, PC_matrix.shape[1]+1)]
df_PCA = pd.DataFrame(PC_matrix, columns=pc_columns)
df_PCA.head()

In [None]:
# `groupy` already carries exploratory 'pc1'/'pc2' columns from the earlier
# 20-component PCA. Drop them before the concat so the result does not end up
# with duplicate 'pc1'/'pc2' labels, which would be selected twice when the
# feature matrix is built from `pc_columns`.
groupy_with_PCA = pd.concat(
    [groupy.reset_index().drop(['pc1', 'pc2'], axis=1, errors='ignore'), df_PCA],
    axis=1,
)

In [None]:
# Display (rows, columns) of the per-listing frame.
groupy.shape

In [None]:
# Display (rows, columns) of the principal-component frame.
df_PCA.shape

In [None]:
# Preview only the first rows; the full frame is hundreds of columns wide.
groupy_with_PCA.head()

In [None]:
# Drop listings whose mean 30-day availability is 30 (or more).
groupy_with_PCA = groupy_with_PCA[groupy_with_PCA.availability_30_mean<30]
lm = LinearRegression()  # NOTE(review): not used below — kept in case later cells rely on it.

# The target is numeric and is scored with R2 / RMSE afterwards, so use a
# regressor; a classifier would treat every availability value as an
# unrelated class label.
rf = RandomForestRegressor(random_state=0)

feature_columns = pc_columns+['review_scores_rating_mean','comments_count','reviews_per_month_mean']
# Fixed seeds make the split and the forest reproducible across re-runs.
X_train,X_test,y_train,y_test = train_test_split(
    groupy_with_PCA[feature_columns].values,
    groupy_with_PCA.availability_30_mean,
    test_size=0.2,
    random_state=0,
)
rf.fit(X_train,y_train)

In [None]:
# Predict the target for the held-out rows.
prediction= rf.predict(X_test)

In [None]:
# scikit-learn metrics take (y_true, y_pred). R2 is not symmetric, so swapping
# the arguments reports a different (wrong) score; RMSE is symmetric but is
# written in the same order for consistency.
print("R2:{}".format(r2_score(y_test, prediction)))
print("RMSE :{}".format(np.sqrt(mean_squared_error(y_test, prediction))))

In [None]:
# Pair each held-out target value with the model's prediction for plotting.
df_prediction = pd.DataFrame(dict(observed=y_test, predicted=prediction))

In [None]:
# Observed vs. predicted 30-day availability on the held-out rows (jittered).
# The previous title, "Review Score", was copied from an earlier plot and did
# not describe this figure.
ggplot(df_prediction, aes(x='observed', y='predicted')) +\
    geom_jitter() +\
    xlab("availability_30") + ylab("prediction") + ggtitle("Observed vs. predicted availability_30")