In [1]:
%load_ext autoreload
%autoreload 2
import pandas as pd
import os
import checkpoint2
import matplotlib.pyplot as plt
import numpy as np
from sklearn.svm import SVC
from textblob import TextBlob
from IPython.display import HTML
from sklearn.model_selection import train_test_split
from textblob.classifiers import NaiveBayesClassifier
from sklearn.cluster import KMeans
from sklearn.preprocessing import MinMaxScaler
from sklearn.tree import DecisionTreeClassifier
from sklearn.neural_network import MLPClassifier
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import GridSearchCV, KFold

# Shared seed passed as `random_state` to the stochastic estimators in this
# notebook (SVC, DecisionTreeClassifier, KMeans) so repeated runs are comparable.
RANDOM_STATE = 42

In [2]:
# Uber reviews: drop every row with a missing value, then store the two flag
# columns as 32-bit integers.
uber = (
    pd.read_csv("uber_cleaned.csv")
    .dropna()
    .astype({'Invited': 'int32', "Unfair": 'int32'})
)
# Lyft reviews: only the 'Invited' flag is cast; rows with missing values are
# kept at this stage.
lyft = pd.read_csv("lyft_cleaned.csv").astype({'Invited': 'int32'})

# Coerce the per-user review count to a number (unparseable entries become NaN)
# and then discard the rows where that coercion failed.
uber = uber.assign(
    **{'# Reviews By User': pd.to_numeric(uber['# Reviews By User'], errors='coerce')}
).dropna(subset=['# Reviews By User'])
lyft = lyft.assign(
    **{"# Reviews By User": pd.to_numeric(lyft['# Reviews By User'], errors='coerce')}
).dropna(subset=['# Reviews By User'])

In [3]:
from sklearn.base import BaseEstimator, TransformerMixin

# Per-word sentiment lookup, computed once from the Uber reviews by the
# checkpoint2 helpers.
# NOTE(review): built from all of `uber`, so it presumably also sees rows that
# later land in validation folds -- confirm this is acceptable.
sentiment = checkpoint2.get_weighed_sentiment_counts(
    checkpoint2.get_sentiment_and_counts(uber)
)


class SentimentTransformer(BaseEstimator, TransformerMixin):
    """Add a 'Sentiment' column holding the summed word scores of each review.

    The transformer is stateless: `fit` learns nothing and the scores come from
    the module-level `sentiment` lookup.
    """

    def fit(self, X, y=None):
        """Nothing to learn; return the transformer itself."""
        return self

    def transform(self, X):
        """Return a copy of `X` with a preprocessed 'Review Body' and a 'Sentiment' column.

        'Review Body' is replaced by the output of `checkpoint2.preprocess_reviews`.
        Each review is then split on whitespace and the scores of the words found
        in `sentiment` are added up; words missing from the lookup contribute 0.
        """
        scored = X.copy()
        scored["Review Body"] = checkpoint2.preprocess_reviews(scored)

        totals = []
        for review in scored["Review Body"]:
            total = 0
            # str() guards against non-string cells before splitting.
            for word in str(review).split():
                if word in sentiment:
                    total += sentiment[word]
            totals.append(total)

        scored['Sentiment'] = totals
        return scored


In [4]:
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.compose import ColumnTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline

# TF-IDF features for the two text columns, each with its own vocabulary cap.
# `min_df` is written as the float 0.0 ("no minimum document proportion"):
# recent scikit-learn releases only accept a float in [0, 1] or an int >= 1
# for this parameter, so the integer 0 is rejected there. The resulting
# vocabulary cut-off is the same.
multinomial_transformer = ColumnTransformer([
    (
        'title_vect',
        TfidfVectorizer(max_df=0.5, max_features=18, min_df=0.0, stop_words='english',
                        sublinear_tf=True, use_idf=False),
        'Review Title',
    ),
    (
        'body_vect',
        TfidfVectorizer(max_df=0.5, max_features=144, min_df=0.0, stop_words=None,
                        sublinear_tf=True, use_idf=True),
        'Review Body',
    ),
])

# Text features -> multinomial naive Bayes.
multinomial_classifier = Pipeline([
    ('selector', multinomial_transformer),
    ('clf', MultinomialNB())
])

# Rows that carry an 'Unfair' label, split into features (every column except
# 'Unfair') and the target.
labeled = uber[uber.Unfair.notnull()]
X_labeled = labeled.loc[:, labeled.columns != 'Unfair']
y_labeled = labeled['Unfair']

# labeled.head()
# tmp = multinomial_transformer.fit_transform(X_labeled, y_labeled)
# checkpoint2.validate_model(multinomial_classifier, 5, X_labeled, y_labeled)

In [5]:
# Hyper-parameter search that was used to tune the text pipeline, kept
# commented out for reference. Several entries are single-value lists that
# match the settings now hard-coded in `multinomial_transformer`.
# clf = GridSearchCV(multinomial_classifier, {'selector__title_vect__use_idf': [False], 'selector__body_vect__use_idf': [True], 
#                                             'selector__body_vect__max_features': [144], 
#                                             'selector__title_vect__max_features': [18],
#                                             'selector__body_vect__max_df': [0.5],
#                                             'selector__body_vect__min_df': [0.0],
#                                             'selector__title_vect__max_df': [0.5],
#                                             'selector__title_vect__min_df': [0.0],
#                                             'selector__title_vect__stop_words': ['english', None],
#                                             'selector__body_vect__stop_words': ['english', None],
#                                             'selector__title_vect__sublinear_tf': [True, False],
#                                             'selector__body_vect__sublinear_tf': [True, False],
#                                             'clf__alpha': [6.0]}
#                                             , n_jobs=-1, refit=True).fit(X_labeled, y_labeled)

from pprint import pprint
# Evaluate the text pipeline on the labelled data; the second argument is
# presumably the number of folds -- TODO confirm against checkpoint2.validate_model.
checkpoint2.validate_model(multinomial_classifier, 5, X_labeled, y_labeled)

array([0.90082645, 0.60052632, 0.73309942, 0.65092732, 0.90082645])

In [6]:
from sklearn.compose import make_column_selector, make_column_transformer

# Numeric feature path: add the 'Sentiment' column, then min-max scale every
# column with a numeric dtype (all other columns are dropped by the column
# transformer's default remainder handling).
# NOTE(review): this one instance is embedded directly in the classifier
# pipeline below, so fitting either object presumably refits the other's
# step as well -- confirm the sharing is intended.
numerical_transformer = Pipeline(
    steps=[
        ('sentiment', SentimentTransformer()),
        (
            'selector',
            make_column_transformer(
                (MinMaxScaler(), make_column_selector(dtype_include=np.number)),
            ),
        ),
    ]
)

# Degree-4 polynomial-kernel SVM on the scaled numeric features, with
# probability estimates enabled.
svm_classifier = Pipeline(
    steps=[
        ('selector', numerical_transformer),
        (
            'clf',
            SVC(
                C=1.0,
                kernel='poly',
                degree=4,
                gamma='auto',
                coef0=0.0,
                probability=True,
                class_weight=None,
                random_state=RANDOM_STATE,
            ),
        ),
    ]
)

def perform_grid_search(model, params, splits=5, jobs=3, X=None, y=None):
    """Run a cross-validated grid search, print the winner, and return the search.

    Parameters
    ----------
    model : estimator
        Estimator or pipeline to tune.
    params : dict or list of dict
        Grid passed to ``GridSearchCV`` as ``param_grid``.
    splits : int, default 5
        Passed to ``GridSearchCV`` as ``cv``.
    jobs : int, default 3
        Passed to ``GridSearchCV`` as ``n_jobs``.
    X : optional
        Features to fit on. Defaults to the notebook's current ``X_labeled``.
    y : optional
        Targets to fit on. Defaults to the notebook's current ``y_labeled``.

    Returns
    -------
    GridSearchCV
        The fitted search object. Because ``refit=True``, it exposes the best
        estimator refitted on the data it was given.
    """
    # Resolve the defaults at call time: a default of `X=X_labeled` in the
    # signature would be frozen when the function is defined and go stale if
    # the data cell is re-run afterwards.
    if X is None:
        X = X_labeled
    if y is None:
        y = y_labeled

    clf = GridSearchCV(model, param_grid=params, refit=True, cv=splits, n_jobs=jobs, verbose=3).fit(X, y)
    pprint(clf.best_params_)
    pprint(clf.best_score_)
    # Hand the fitted search back so the refitted best estimator is not lost.
    return clf

# clf = GridSearchCV(SVC(), {
#     'C': np.linspace(0, 1.5, 10),
#     'kernel': ['poly'],
#     'degree': [4, 5],
#     'gamma': ['auto', 'scale'],
#     'coef0': [0.0],
#     'probability': [True],
#     'class_weight': [None, 'balanced'],
#     'random_state': [RANDOM_STATE]
# }, n_jobs=3, refit=True).fit(numerical_transformer.fit_transform(X_labeled), y_labeled)

# checkpoint2.validate_model(svm_classifier, 5, X_labeled, y_labeled)

In [7]:
# Decision tree on the scaled numeric features. `min_samples_split` is given
# as a float, i.e. a fraction of the samples, not an absolute count.
dt_classifier = Pipeline(
    steps=[
        ('selector', numerical_transformer),
        (
            'clf',
            DecisionTreeClassifier(
                criterion='gini',
                splitter='best',
                min_samples_split=0.325,
                max_features='sqrt',
                random_state=RANDOM_STATE,
                class_weight=None,
            ),
        ),
    ]
)

# Grid search over the tree's hyper-parameters, kept commented out for reference.
# perform_grid_search(DecisionTreeClassifier(), {
#     'criterion': ['gini', 'entropy'],
#     'splitter': ['best'],
#     'min_samples_split': np.linspace(0.1, 0.4, 5),
#     'max_features': ['sqrt'],
#     'random_state': [RANDOM_STATE],
#     'class_weight': [None, 'balanced'],
# }, 5, 3, X=numerical_transformer.fit_transform(X_labeled))

In [8]:
# pd.concat([uber, lyft]).columns

(2699, 14)
(2699, 14)


In [11]:
# Inertia values for the elbow plot; only filled by the commented-out loop below.
distorsions = []

# Combine both review sets and remove the label column before clustering.
# 'Unfair' is dropped by name: a positional boolean mask built from
# `uber.columns` only lines up with the concatenated frame while `lyft`
# contributes no extra columns, and raises (or drops the wrong column)
# otherwise.
both = pd.concat([uber, lyft]).drop(columns='Unfair')
X_numerical = numerical_transformer.fit_transform(both)

# Elbow-curve exploration used to choose the number of clusters.
# for k in range(2, 20):
#     kmeans = KMeans(n_clusters=k, random_state=RANDOM_STATE, n_init='auto')
#     kmeans.fit(X_numerical)
#     distorsions.append(kmeans.inertia_)
#     print("done with ", k)

# fig = plt.figure(figsize=(15, 5))
# plt.plot(range(2, 20), distorsions)
# plt.grid(True)
# plt.title('Elbow curve')

# Keep the fitted estimator around (centres, inertia) under its own name.
# `kmeans` itself holds the per-row cluster labels returned by `fit_predict`
# (an array), not the estimator.
kmeans_model = KMeans(n_clusters=5, max_iter=1000, random_state=RANDOM_STATE, n_init='auto')
kmeans = kmeans_model.fit_predict(X_numerical)

In [None]:
# `kmeans` is the label array returned by KMeans(...).fit_predict on the combined
# Uber + Lyft features, while `X_labeled` is derived from the Uber rows only, so
# the first dimensions are not expected to match unless `lyft` is empty.
print(kmeans.shape, X_labeled.shape)

AttributeError: 'KMeans' object has no attribute 'shape'