In [1]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import os
import checkpoint2
import matplotlib.pyplot as plt
import numpy as np
from sklearn.svm import SVC
from textblob import TextBlob
from IPython.display import HTML
from sklearn.model_selection import train_test_split
from textblob.classifiers import NaiveBayesClassifier
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, KFold

# Seed passed as `random_state` to the SVC, DecisionTreeClassifier and
# BisectingKMeans estimators below, so repeated runs give the same models.
RANDOM_STATE = 42

In [2]:
UBER_CACHE = "uber_preprocessed.csv"
LYFT_CACHE = "lyft_preprocessed.csv"


def load_raw_reviews(path, int_columns, drop_incomplete=False):
    """Read a cleaned review CSV and normalise its column dtypes.

    Parameters
    ----------
    path : str
        CSV file to read.
    int_columns : list of str
        Columns to cast to ``int32``.
    drop_incomplete : bool, default False
        When True, rows containing any missing value are dropped before the
        integer cast.

    Returns
    -------
    pandas.DataFrame
        The rows whose '# Reviews By User' value parses as a number; values
        that do not parse are coerced to NaN and the row is discarded.
    """
    frame = pd.read_csv(path)
    if drop_incomplete:
        frame = frame.dropna()
    frame = frame.astype({column: 'int32' for column in int_columns})
    frame['# Reviews By User'] = pd.to_numeric(frame['# Reviews By User'], errors='coerce')
    return frame.dropna(subset=['# Reviews By User'])


# The raw (un-preprocessed) frames are needed on both paths, so load them once
# instead of repeating the same steps in each branch.
raw_uber = load_raw_reviews("uber_cleaned.csv", ['Invited', 'Unfair'], drop_incomplete=True)
raw_lyft = load_raw_reviews("lyft_cleaned.csv", ['Invited'])

if not os.path.exists(UBER_CACHE) or not os.path.exists(LYFT_CACHE):
    # At least one cache file is missing: preprocess both review sets and
    # write both caches so the two files always come from the same run.
    uber = raw_uber.copy()
    lyft = raw_lyft.copy()
    lyft["Review Body"] = checkpoint2.preprocess_reviews(lyft)
    uber["Review Body"] = checkpoint2.preprocess_reviews(uber)
    uber.to_csv(UBER_CACHE, index=False)
    lyft.to_csv(LYFT_CACHE, index=False)
else:
    # Both caches exist: reuse the stored preprocessed text.
    uber = pd.read_csv(UBER_CACHE)
    lyft = pd.read_csv(LYFT_CACHE)

# Read by SentimentTransformer.transform to decide whether it still has to run
# checkpoint2.preprocess_reviews; `uber` and `lyft` are preprocessed on both paths.
ALREADY_PREPROCESSED = True

In [3]:
from sklearn.base import BaseEstimator, TransformerMixin

# Per-word sentiment lookup, built from the Uber reviews only.
# SentimentTransformer.transform indexes it by word (`sentiment[word]`) and
# sums the values for each review.
# NOTE(review): presumably a word -> weighted score mapping; the exact
# weighting lives in checkpoint2 — confirm there.
sentiment = checkpoint2.get_weighed_sentiment_counts(checkpoint2.get_sentiment_and_counts(uber))
class SentimentTransformer(BaseEstimator, TransformerMixin):
    """Stateless transformer that appends a per-review 'Sentiment' column.

    The score of a review is the sum of the module-level ``sentiment`` values
    of its whitespace-separated words; words missing from ``sentiment`` count
    as 0. Relies on the module-level ``ALREADY_PREPROCESSED`` flag to decide
    whether the review text still needs ``checkpoint2.preprocess_reviews``.
    """

    def fit(self, X, y=None):
        """Nothing is learned from the data; return the transformer itself."""
        return self

    @staticmethod
    def _score(text):
        """Sum the sentiment values of the words in one review string."""
        return sum([sentiment[token] if token in sentiment else 0 for token in text.split()])

    def transform(self, X):
        """Return a copy of ``X`` with stringified 'Review Body' and a new 'Sentiment' column."""
        frame = X.copy()  # never modify the caller's frame
        if not ALREADY_PREPROCESSED:
            frame["Review Body"] = checkpoint2.preprocess_reviews(frame)
        # Force every body to str so missing values do not break .split().
        bodies = [str(body) for body in frame["Review Body"]]
        frame["Review Body"] = bodies
        frame['Sentiment'] = [self._score(body) for body in bodies]
        return frame


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# Text features for the Naive Bayes model: separate TF-IDF vocabularies for
# the review title (18 terms, English stop words removed, no IDF weighting)
# and the review body (144 terms, IDF weighting on).
#
# min_df is written as the float 0.0 (a proportion of documents) rather than
# the integer 0 (an absolute document count). Both mean "no lower cut-off",
# but an integer min_df is validated as >= 1 by newer scikit-learn releases,
# and 0.0 matches the values used in the grid search for these vectorizers.
multinomial_transformer = ColumnTransformer([
    (
        'title_vect',
        TfidfVectorizer(max_df=0.5, max_features=18, min_df=0.0, stop_words='english',
                        sublinear_tf=True, use_idf=False),
        'Review Title',
    ),
    (
        'body_vect',
        TfidfVectorizer(max_df=0.5, max_features=144, min_df=0.0, stop_words=None,
                        sublinear_tf=True, use_idf=True),
        'Review Body',
    ),
])

# Full model: vectorise title and body, then fit Multinomial Naive Bayes on the result.
multinomial_classifier = Pipeline([
    ('selector', multinomial_transformer),
    ('clf', MultinomialNB())
])
# Supervised subset: Uber reviews that carry an 'Unfair' label, split into
# the feature frame (every column except the label) and the target series.
labeled = uber.loc[uber['Unfair'].notna()]
y_labeled = labeled['Unfair']
X_labeled = labeled.drop(columns='Unfair')

# labeled.head()
# tmp = multinomial_transformer.fit_transform(X_labeled, y_labeled)
# checkpoint2.validate_model(multinomial_classifier, 5, X_labeled, y_labeled)

In [5]:
# clf = GridSearchCV(multinomial_classifier, {'selector__title_vect__use_idf': [False], 'selector__body_vect__use_idf': [True], 
#                                             'selector__body_vect__max_features': [144], 
#                                             'selector__title_vect__max_features': [18],
#                                             'selector__body_vect__max_df': [0.5],
#                                             'selector__body_vect__min_df': [0.0],
#                                             'selector__title_vect__max_df': [0.5],
#                                             'selector__title_vect__min_df': [0.0],
#                                             'selector__title_vect__stop_words': ['english', None],
#                                             'selector__body_vect__stop_words': ['english', None],
#                                             'selector__title_vect__sublinear_tf': [True, False],
#                                             'selector__body_vect__sublinear_tf': [True, False],
#                                             'clf__alpha': [6.0]}
#                                             , n_jobs=-1, refit=True).fit(X_labeled, y_labeled)
from pprint import pprint
# Evaluate the Naive Bayes pipeline on the labeled Uber reviews.
# NOTE(review): the 5 is presumably the number of cross-validation folds (the
# recorded output holds five scores) — confirm against checkpoint2.validate_model.
checkpoint2.validate_model(multinomial_classifier, 5, X_labeled, y_labeled)

array([0.89917355, 0.59052632, 0.73715957, 0.64825451, 0.89917355])

In [6]:
from sklearn.compose import make_column_selector, make_column_transformer

# Numeric feature branch: add the 'Sentiment' column, then keep only the
# numeric-dtype columns and rescale each of them with MinMaxScaler.
numerical_transformer = Pipeline(steps=[
    ('sentiment', SentimentTransformer()),
    (
        'selector',
        make_column_transformer(
            (MinMaxScaler(), make_column_selector(dtype_include=np.number)),
        ),
    ),
])

# Degree-4 polynomial-kernel SVM on the scaled numeric features.
# probability=True is set so the fitted model exposes predict_proba.
svm_classifier = Pipeline(steps=[
    ('selector', numerical_transformer),
    (
        'clf',
        SVC(
            C=1.0,
            kernel='poly',
            degree=4,
            gamma='auto',
            coef0=0.0,
            probability=True,
            class_weight=None,
            random_state=RANDOM_STATE,
        ),
    ),
])

def perform_grid_search(model, params, splits=5, jobs=3, X=X_labeled, y=None):
    """Grid-search ``model`` over ``params``, print the winner and return the search.

    Parameters
    ----------
    model : estimator
        Estimator or pipeline handed to ``GridSearchCV``.
    params : dict or list of dict
        Passed to ``GridSearchCV`` as ``param_grid``.
    splits : int, default 5
        Passed to ``GridSearchCV`` as ``cv``.
    jobs : int, default 3
        Passed to ``GridSearchCV`` as ``n_jobs``.
    X : default ``X_labeled``
        Training features. The default is bound when this function is defined.
    y : optional
        Training targets matching ``X``. When omitted, the module-level
        ``y_labeled`` is used, as before.

    Returns
    -------
    GridSearchCV
        The fitted search. Because ``refit=True``, ``best_estimator_`` is
        already retrained on all of ``X`` and can be used directly. End the
        calling cell with ``;`` to suppress the notebook repr.
    """
    # Looked up at call time so a re-created y_labeled is picked up.
    targets = y_labeled if y is None else y
    search = GridSearchCV(model, param_grid=params, refit=True, cv=splits, n_jobs=jobs, verbose=3)
    search.fit(X, targets)
    pprint(search.best_params_)
    pprint(search.best_score_)
    return search

# clf = GridSearchCV(SVC(), {
#     'C': np.linspace(0, 1.5, 10),
#     'kernel': ['poly'],
#     'degree': [4, 5],
#     'gamma': ['auto', 'scale'],
#     'coef0': [0.0],
#     'probability': [True],
#     'class_weight': [None, 'balanced'],
#     'random_state': [RANDOM_STATE]
# }, n_jobs=3, refit=True).fit(numerical_transformer.fit_transform(X_labeled), y_labeled)

# checkpoint2.validate_model(svm_classifier, 5, X_labeled, y_labeled)

In [7]:
# Decision tree on the scaled numeric features. min_samples_split is a float,
# so it is a fraction of the training samples rather than an absolute count.
dt_classifier = Pipeline(steps=[
    ('selector', numerical_transformer),
    (
        'clf',
        DecisionTreeClassifier(
            criterion='gini',
            splitter='best',
            min_samples_split=0.325,
            max_features='sqrt',
            random_state=RANDOM_STATE,
            class_weight=None,
        ),
    ),
])

# perform_grid_search(DecisionTreeClassifier(), {
#     'criterion': ['gini', 'entropy'],
#     'splitter': ['best'],
#     'min_samples_split': np.linspace(0.1, 0.4, 5),
#     'max_features': ['sqrt'],
#     'random_state': [RANDOM_STATE],
#     'class_weight': [None, 'balanced'],
# }, 5, 3, X=numerical_transformer.fit_transform(X_labeled))

In [27]:
from sklearn.cluster import BisectingKMeans
from sklearn.pipeline import FeatureUnion

# Stack the preprocessed reviews of both companies and remove the label.
# 'Unfair' is dropped by name: a boolean mask built from uber.columns only
# lines up with the concatenated frame when lyft contributes no extra columns.
both = pd.concat([uber, lyft]).drop(columns='Unfair')

# Cluster on the scaled numeric features (including Sentiment) side by side
# with the title/body TF-IDF features.
# NOTE(review): these are the same transformer objects held by the classifier
# pipelines above, so fitting `pipe` appears to refit them in place — confirm
# before reusing an already-fitted classifier after this cell.
pipe = FeatureUnion([
    ('numerical', numerical_transformer),
    ('multinomial', multinomial_transformer)
])
X_numerical = pipe.fit_transform(both)
distorsions = []
# rn = range(80, 120)
# for k in rn:
#     kmeans = BisectingKMeans(n_clusters=k, random_state=RANDOM_STATE)
#     kmeans.fit(X_numerical)
#     distorsions.append(kmeans.inertia_)

# fig = plt.figure(figsize=(15, 5))
# plt.plot(rn, distorsions)
# plt.grid(True)
# plt.title('Elbow curve')

kmeans = BisectingKMeans(n_clusters=10, random_state=RANDOM_STATE)
clusters = pd.Series(kmeans.fit_predict(X_numerical), name='Cluster')

# Attach each cluster label to the raw (un-preprocessed) review so the
# clusters are easier to read. The join is positional, so the raw frames must
# hold exactly the rows that were clustered, in the same order.
combined = pd.concat([raw_uber, raw_lyft]).reset_index(drop=True)
if len(combined) != len(clusters):
    raise ValueError(
        f"{len(combined)} raw reviews but {len(clusters)} cluster labels; "
        "the preprocessed CSV caches are probably stale - delete them and rerun."
    )
combined = pd.concat([combined, clusters], axis=1)

In [30]:
# Use this to get a random sample of the rows in one cluster.
# (Set cluster_num to any cluster number 0 <= n < num_clusters. limit is the maximum number of rows to show at once.)
# Note that lyft does not have any labeled Unfair data. Sentiment is included in the clustering, but it's inconvenient to get it to display here, so I ignored it.
from tabulate import tabulate
# Columns to show, the maximum number of rows to draw, and the cluster to inspect.
cols = [
    "# Reviews By User",
    "Invited",
    "Rating",
    "Review Body",
    "Unfair",
    "Company",
    "Cluster",
]
limit = 50
cluster_num = 0

# Rows of the chosen cluster, restricted to the display columns.
data = combined.loc[combined.Cluster == cluster_num, cols]

# Never ask for more rows than the cluster holds; render the sample as an HTML table.
HTML(tabulate(data.sample(n=min(limit, len(data))), headers=cols, tablefmt='html'))


# combined[(combined.Cluster == cluster_num) & (combined.Company.str.contains('Lyft'))].shape

Unnamed: 0,# Reviews By User,Invited,Rating,Review Body,Unfair,Company,Cluster
501,2,1,5,great driver that got me to my destination!,0,Uber,0
250,1,1,3,"driver was polite, vehicle was clean, ride uneventful. however, i was disappointed that my driver did not exit the vehicle and help load/unload my bags. this was a first for me.",1,Uber,0
397,1,1,5,driver was very nice and her driving was brilliant 10 out of 10,0,Uber,0
128,3,1,5,excellent drivers in clean cars. always on hand just minutes away.,0,Uber,0
523,1,1,5,"it’s all the drivers , they are absolutely great. they are always friendly, you can chat with them, i’ve never had an issue it’s always a great ride . thank you ☺️",0,Uber,0
194,10,1,5,"driver martyn was the best, wonderful conversation,",0,Uber,0
478,4,1,5,have used uber a few times now and in different countries. up until then was always a fan of a particular cab company. since the pandemic though it's proven difficult to get a cab from anything other than uber. there's always been a driver available so far and not having to deal with cash a bonus. can't fault it,0,Uber,0
270,2,1,5,"the best taxi service to use. on time , no delay and responsible driver's.",0,Uber,0
419,1,1,5,"my first ride and it was very good. the driver francisco, was pleasant and friendly person. he was on time and it was a safe ride to destination. i will certainly use uber in the near future. great alternative to taxi and the cost of the ride was reasonable.",0,Uber,0
497,2,1,5,the driver was on time and got me to where i was going safely and on time.,0,Uber,0
