In [1]:
import warnings
# Silence every warning for the rest of the session.
# NOTE(review): this blanket filter also hides deprecation and chained-assignment
# warnings raised by the libraries used below — consider narrowing it.
warnings.filterwarnings('ignore')

In [2]:
import pandas as pd
import numpy as np
import seaborn as sns
import matplotlib.pyplot as plt

In [3]:
# Location of the review CSV.
path = 'yelp_data/health_text_sentiment.csv'

# Read the raw file and discard the 'Unnamed: 0' column.
# Reassigning instead of using inplace=True keeps the step explicit.
df = pd.read_csv(path)
df = df.drop(columns=['Unnamed: 0'])

# Work on an independent copy of the three columns used below, so that later
# edits to `data` neither touch `df` nor operate on an ambiguous view of it.
data = df[['stars', 'text', 'clean_text']].copy()

In [4]:
# Preview the first rows of the selected columns (stars, text, clean_text).
data.head()

Unnamed: 0,stars,text,clean_text
0,1,Please stay away from this place if you can! I...,please stay away place bad care imaginable sta...
1,5,My husband has been a patient of Dr. Byrne for...,husband patient dr byrne last year half last m...
2,4,Dr. Byrne is a great doctor! She has great bed...,dr byrne great doctor great bed side manner ex...
3,3,I'm raising my review as Dr Bryne's has been m...,raise review dr bryne receptive daughter go an...
4,1,I wish I could give 0 stars. Worst office I've...,wish could give star bad office ever horrible ...


In [None]:
# Print column dtypes and non-null counts for `data`.
data.info()

In [None]:
# NOTE(review): `category_id_df` is not defined in any cell visible in this notebook,
# so this line raises NameError on a fresh "Restart & Run All"; `category_to_id` is
# also not used by any later visible cell — confirm whether this cell can be removed.
category_to_id = dict(category_id_df.values)


## Select only 1-star and 5-star reviews

In [None]:
# Keep only the 1-star and 5-star reviews. A boolean mask replaces the removed
# `.ix` indexer, and `.copy()` gives an independent frame that is safe to modify.
data = data.loc[data['stars'].isin([1, 5])].copy()

# Encode the label as binary in a single pass: 1 star -> 0, 5 stars -> 1.
# One mapping avoids the ordering pitfalls of successive replace() calls and does
# not rely on chained in-place mutation of a column.
data['stars'] = data['stars'].map({1: 0, 5: 1})
data.head()

# Multinomial NB


In [None]:
# Preview `data` again (same call as at the end of the previous code cell).
data.head()

In [None]:
# Small subset of `data`; in the visible cells it is only previewed and
# referenced from commented-out experiments.
# NOTE(review): `.loc` slicing is label-based and inclusive, so this selects rows
# whose index label is <= 100 rather than the first 100 rows — confirm intent.
test = data.loc[:100]

In [None]:
# Preview the small subset.
test.head()

In [None]:
%%time
from sklearn.feature_extraction.text import TfidfVectorizer

tfidf = TfidfVectorizer(sublinear_tf=True, min_df=5, norm='l2', encoding='latin-1', ngram_range=(1, 2), stop_words='english')

# features = tfidf.fit_transform(test.text).toarray()
# labels = test.stars

features = tfidf.fit_transform(data.text).toarray()
labels = data.stars

In [None]:
# Number of rows in the feature matrix.
len(features)

In [None]:
# Number of labels; should equal the number of feature rows above.
len(labels)

In [None]:
#https://github.com/susanli2016/Machine-Learning-with-Python/blob/master/Consumer_complaints.ipynb

In [None]:
# %%time
# from sklearn.model_selection import train_test_split
# from sklearn.feature_extraction.text import CountVectorizer
# from sklearn.feature_extraction.text import TfidfTransformer
# from sklearn.naive_bayes import MultinomialNB

# X_train, X_test, y_train, y_test = train_test_split(test['text'], test['stars'], random_state = 0)
# count_vect = CountVectorizer()
# X_train_counts = count_vect.fit_transform(X_train)
# tfidf_transformer = TfidfTransformer()
# X_train_tfidf = tfidf_transformer.fit_transform(X_train_counts)

# clf = MultinomialNB().fit(X_train_tfidf, y_train)

In [None]:
%%time
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.svm import LinearSVC

from sklearn.model_selection import cross_val_score


models = [
    RandomForestClassifier(n_estimators=200, max_depth=3, random_state=0),
    LinearSVC(),
    MultinomialNB(),
    LogisticRegression(random_state=0),
]

CV = 5
cv_df = pd.DataFrame(index=range(CV * len(models)))
entries = []
for model in models:
    model_name = model.__class__.__name__
    accuracies = cross_val_score(model, features, labels, scoring='accuracy', cv=CV)
    for fold_idx, accuracy in enumerate(accuracies):
        entries.append((model_name, fold_idx, accuracy))
    
cv_df = pd.DataFrame(entries, columns=['model_name', 'fold_idx', 'accuracy'])

In [None]:
import seaborn as sns

# Box plot of the per-fold accuracies for each model; keep the axes handle so
# the individual fold scores are overlaid on the same axes.
ax = sns.boxplot(data=cv_df, x='model_name', y='accuracy')
sns.stripplot(
    data=cv_df,
    x='model_name',
    y='accuracy',
    jitter=True,
    size=8,
    linewidth=2,
    edgecolor="gray",
    ax=ax,
)
plt.show()

In [None]:
# Column dtypes and non-null counts of the cross-validation results.
cv_df.info()

In [None]:
# Preview the first cross-validation records.
cv_df.head()

In [None]:
# Persist the per-fold scores. With the default arguments the DataFrame index is
# written as an extra unnamed leading column.
cv_df.to_csv('yelp_data/health_text_model_results.csv')

In [None]:
# Mean cross-validated accuracy per model.
cv_df.groupby('model_name').accuracy.mean()

In [None]:
# Same summary of `cv_df` as shown a few cells above.
cv_df.info()

In [None]:
# Independent copy of the results, so the fold renumbering below leaves `cv_df` unchanged.
cv_models = cv_df.copy()

In [None]:
# Display the copied results before renumbering the folds.
cv_models

In [None]:
# Shift fold 4 -> 5 (highest fold first, so a shifted value is never shifted again).
# Assign the result back instead of mutating the column through chained inplace access.
cv_models['fold_idx'] = cv_models['fold_idx'].replace(4, 5)

In [None]:
# Shift fold 3 -> 4; assigned back rather than mutated through chained inplace access.
cv_models['fold_idx'] = cv_models['fold_idx'].replace(3, 4)

In [None]:
# Shift fold 2 -> 3; assigned back rather than mutated through chained inplace access.
cv_models['fold_idx'] = cv_models['fold_idx'].replace(2, 3)

In [None]:
# Shift fold 1 -> 2; assigned back rather than mutated through chained inplace access.
cv_models['fold_idx'] = cv_models['fold_idx'].replace(1, 2)

In [None]:
# Shift fold 0 -> 1; assigned back rather than mutated through chained inplace access.
cv_models['fold_idx'] = cv_models['fold_idx'].replace(0, 1)

In [None]:
# Display the results after the fold renumbering cells above.
cv_models

In [None]:
# Renumber the folds from 0-based to 1-based in one vectorised step.
# The values are derived from the untouched `cv_df`, so re-running this cell (or
# running it after the individual shift cells above) always yields folds 1..CV.
# An ascending chain of replace() calls (0->1, then 1->2, ...) would instead
# cascade and collapse every fold to the largest value.
cv_models['fold_idx'] = cv_df['fold_idx'] + 1

In [None]:
# Display the results with the final fold numbering.
cv_models

In [None]:
# Encode the star rating as a binary label (1 star -> 0, 5 stars -> 1), but only
# while raw 5-star values are still present. Once the labels are already 0/1,
# replacing 1 with 0 again would turn every positive example into a negative.
# The dict form applies both substitutions at once, so a value is never replaced twice.
if data['stars'].eq(5).any():
    data['stars'] = data['stars'].replace({1: 0, 5: 1})

In [None]:
# Plot the mean cross-validated accuracy per model. Only `accuracy` is selected:
# averaging `fold_idx` is meaningless and would distort the y-axis scale.
cv_df.groupby('model_name')['accuracy'].mean().plot()

In [None]:
# Load seaborn's example "fmri" dataset here, so this preview does not depend on
# a later cell having been executed first.
fmri = sns.load_dataset("fmri")
fmri.head()

In [None]:
# seaborn line-plot example on the bundled "fmri" dataset
# (pasted REPL prompts removed so the cell is plain Python).
import seaborn as sns
sns.set()
import matplotlib.pyplot as plt
fmri = sns.load_dataset("fmri")
ax = sns.lineplot(x="timepoint", y="signal", data=fmri)

In [None]:
# Same example, with one line per `event` value.
ax = sns.lineplot(x="timepoint", y="signal", hue="event",
                  data=fmri)

In [None]:
# Accuracy per cross-validation fold, one line per model.
ax = sns.lineplot(x="fold_idx", y="accuracy", hue="model_name",
                  data=cv_models)

In [None]:
# Drop the RandomForestClassifier rows so the remaining models can be compared
# on a tighter accuracy scale. Filtering by name selects the same rows as skipping
# the first five index labels, without depending on fold count or model order.
new = cv_models[cv_models['model_name'] != 'RandomForestClassifier']

In [None]:
# Accuracy per fold for the subset of models held in `new`.
ax = sns.lineplot(x="fold_idx", y="accuracy", hue="model_name",
                  data=new)

In [None]:
from sklearn.model_selection import train_test_split

# Final model: a linear SVM fitted on a single hold-out split.
model = LinearSVC()

# Split features, labels and the matching row labels together. The index must come
# from `labels` (the filtered 1-/5-star rows): `df.index` spans every review in the
# raw file, has a different length, and makes train_test_split raise ValueError.
X_train, X_test, y_train, y_test, indices_train, indices_test = train_test_split(
    features, labels, labels.index, test_size=0.33, random_state=0
)
model.fit(X_train, y_train)

# Predictions for the held-out third of the data.
y_pred = model.predict(X_test)