<a href="https://colab.research.google.com/github/eduartheinen/foursquare-tips/blob/master/foursquare_tips.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pip setuptools spacy wheel xgboost
# !python -m spacy download pt_core_news_sm # comment this line after first run

In [2]:
import re
import string
import spacy
import pandas as pd
import numpy as np

from tqdm import tqdm
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.feature_selection import chi2, SelectKBest
from sklearn.decomposition import TruncatedSVD
from sklearn.model_selection import KFold

#Data Preprocessing

In [22]:
class FoursquareTipsDataset:
    def __init__(self, df, ngram_range=None):
        # extracting lemmas and POS tags with spacy even though we are not using them yet
        self.sentences, self.lemmas, self.pos = self.preprocess(df.texto)
        self.labels = df.rotulo

        # bag of words
        self.count_vectorizer = CountVectorizer(ngram_range=ngram_range)
        self.bow = self.count_vectorizer.fit_transform(self.sentences)

        # tfidf
        self.tfidf_vectorizer = TfidfVectorizer(ngram_range=ngram_range)
        self.tfidf = self.tfidf_vectorizer.fit_transform(self.sentences)

        # for easy indexing 
        self.sentences = pd.DataFrame(self.sentences)
        self.lemmas = pd.DataFrame(self.lemmas)
        self.pos = pd.DataFrame(self.pos)
        
        # wouldnt be possible with a full dataset
        # TODO: implement smart indexing with getitem, x, and y methods
        # self.bow = pd.DataFrame(self.bow.toarray())
        # self.tfidf = pd.DataFrame(self.tfidf.toarray())

    @staticmethod
    def preprocess(reviews):
        sentences = []
        lemmas = []
        pos = []

        for sentence in tqdm(reviews):
            sentence = re.sub(r'http\S+', '', sentence)  # removes urls before punctuation
            punctuation_to_space = str.maketrans(string.punctuation, ' ' * len(string.punctuation))
            sentence = sentence.translate(punctuation_to_space)  # change punctuations to spaces
            sentence = str.lower(sentence)
            sentence = re.sub(' +', ' ', sentence)  # removes double spaces
            sentence = re.sub('\d+', '', sentence)  # removes numbers
            
            # spacy processing -- nlp(sentence) -- adds properties to words, 
            # like "lemma_", "pos_" and "is_stop" for stop_words.
            sentence = list(filter(lambda w: not w.is_stop, nlp(sentence)))
            lemmas.append([w.lemma_ for w in sentence if not w.is_stop])
            pos.append([w.pos_ for w in sentence if not w.is_stop])
            
            # sklearn count/tfidf vectorizers require raw text
            sentences.append(' '.join([w.text for w in sentence]))

        return sentences, lemmas, pos

    # returns train_x, train_y, test_x and test_y folds, according to cross validation indexes
    def get_folds(self, train_index, test_index, feat=None):
      if type(train_index) == int:
        if feat == 'tfidf':
          return self.tfidf.iloc[:train_index], self.labels.iloc[:train_index], self.tfidf.iloc[test_index:], self.labels.iloc[test_index:]
        return self.bow.iloc[:train_index], self.labels.iloc[:train_index], self.bow.iloc[test_index:], self.labels.iloc[test_index:]
      
      else:
        if feat == 'tfidf':
          return self.tfidf.iloc[train_index], self.labels.iloc[train_index], self.tfidf.iloc[test_index], self.labels.iloc[test_index]
        return self.bow.iloc[train_index], self.labels.iloc[train_index], self.bow.iloc[test_index], self.labels.iloc[test_index]


    # TODO: implement smart indexing at __getitem__
    def __getitem__(self, i):
      return self.bow.iloc[i], self.tfidf.iloc[i], self.labels.iloc[i]
    
    def x_bow(self, i):
      return self.bow.iloc[i]
    
    def x_tfidf(self, i):
      return self.tfidf.iloc[i]
    
    def y(self, i):
      return self.labels.iloc[i]

    def __len__(self):
        return len(self.sentences)

### Load and Process Dataset

In [23]:
nlp = spacy.load("pt_core_news_sm")
path = 'https://raw.githubusercontent.com/eduartheinen/foursquare-tips/master/data/'
df = pd.read_csv(path + 'tips_scenario1_train.csv').dropna(how='any')
data = FoursquareTipsDataset(df, ngram_range=(1, 2))

100%|██████████| 1708/1708 [00:14<00:00, 115.87it/s]


### New Possibilities

In [5]:
# Repositório de Word-Embeddings em Português do NILC-ICMC-USP http://www.nilc.icmc.usp.br/embeddings
# embeddings_path = "http://143.107.183.175:22980/download.php?file=embeddings/glove/glove_s100.zip"

# Bert treinado em Português
# https://github.com/neuralmind-ai/portuguese-bert

#Feature Engineering

In [None]:
feature_names = data.count_vectorizer.get_feature_names()
support = 1
best = None

for k in range(100, 4000, 100):
  ch2 = SelectKBest(chi2, k=k)
  ch2.fit(data.bow, data.labels)
  new_support = sum([ch2.scores_[i] for i in ch2.get_support(indices=True)])
  print(f'support with {k} best features: {new_support} -- growth rate: {new_support/support}')
  if new_support > support:
    support = new_support
    best = ch2

# selected_features = {feature_names[i]: ch2.scores_[i] for i in ch2.get_support(indices=True)}
# sorted(selected_features.items(), key=lambda x: x[1], reverse=True)


In [None]:
for c in range(100, 2000, 100):
  svd = TruncatedSVD(n_components=c, n_iter=10)
  svd.fit(data.tfidf)
  print(f'{c} components -- explained variance {svd.explained_variance_ratio_.sum()}')
  if svd.explained_variance_ratio_.sum() > 0.98:
    break

# Naive Bayes

In [51]:
from sklearn.naive_bayes import GaussianNB

kf = KFold(n_splits=10)
kf.get_n_splits(data.bow)

from sklearn.naive_bayes import GaussianNB

kf = KFold(n_splits=10)
kf.get_n_splits(data.bow)

scores = []
for train_index, test_index in kf.split(data.bow):
  x_train, y_train, x_test, y_test = data.get_folds(train_index, test_index)
  
  gnb = GaussianNB()
  gnb.fit(x_train, y_train)
  scores.append(gnb.score(x_test, y_test))

print(f'Gaussian Naive-Bayes mean score: {np.mean(scores)}')

Gaussian Naive-Bayes mean score through 10 k-fold: 0.6820674234606123


#Logistic Regression

In [26]:
from sklearn.linear_model import LogisticRegressionCV

train_index = test_index = int(len(data)*0.75) # testing with 0.25 of dataset
x_train, y_train, x_test, y_test = data.get_folds(train_index, test_index)

lr_cv = LogisticRegressionCV(cv=3, Cs=20, random_state=0, n_jobs=-1).fit(x_train, y_train)
lr_cv.fit(x_train, y_train)#.predict(x_test)
print(f'CV Logistic Regression mean score: {lr_cv.score(x_test, y_test)}')

integer index.


[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  2.4min finished
[Parallel(n_jobs=-1)]: Using backend LokyBackend with 2 concurrent workers.
[Parallel(n_jobs=-1)]: Done   3 out of   3 | elapsed:  2.4min finished


CV Logistic Regression: 0.7049180327868853


#Support Vector Classification

In [37]:
from sklearn.svm import LinearSVC

train_index = test_index = int(len(data)*0.75) # testing with 0.25 of dataset
x_train, y_train, x_test, y_test = data.get_folds(train_index, test_index)

scv = LinearSVC(C=1.0)
scv.fit(x_train,y_train)
print(f"SVC mean score {scv.score(x_test, y_test)}")

integer index.
SVC mean score 0.7259953161592506


#XGBoost

In [57]:
from xgboost import XGBClassifier
from sklearn.model_selection import cross_val_score

train_index = test_index = int(len(data)*0.75) # testing with 0.25 of dataset
x_train, y_train, x_test, y_test = data.get_folds(train_index, test_index)


xgb = XGBClassifier()
scores = cross_val_score(xgb, x_train, y_train)
print(f"XGB mean score {np.mean(scores)}")

# xgb.fit(x_train, y_train, verbose=2)
# xgb.feature_importances_

integer index.


AttributeError: ignored

#Bi-directional LSTM

In [73]:
# tmp = pd.DataFrame(xgb.feature_importances_)
# tmp.sort_values(ascending=False, by=0).index

AttributeError: ignored

#Fine Tunned BERT