# Imports

In [1]:
# basics
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# preprocessing
import re
import nltk
from nltk.tokenize import word_tokenize 
from nltk.corpus import stopwords, wordnet
from nltk.stem import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('punkt')
nltk.download('stopwords')
nltk.download('wordnet')
nltk.download('snowball_data')
nltk.download('averaged_perceptron_tagger')

# sentiment analysis (vaderSentiment)
try:
    from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
except:
    !pip install vaderSentiment
    from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# # pycld2
# try:
#   import pycld2 as cld2
# except:
#   !pip install pycld2
#   import pycld2 as cld2

# # deep_translator
# try:
#   from deep_translator import GoogleTranslator
# except:
#   !pip install deep-translator
#   from deep_translator import GoogleTranslator

# sklearn
from sklearn.model_selection import train_test_split, learning_curve
from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report, RocCurveDisplay

from imblearn.over_sampling import SMOTE

[nltk_data] Downloading package punkt to
[nltk_data]     C:\Users\lukas\AppData\Roaming\nltk_data...
[nltk_data]   Package punkt is already up-to-date!
[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lukas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lukas\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package snowball_data to
[nltk_data]     C:\Users\lukas\AppData\Roaming\nltk_data...
[nltk_data]   Package snowball_data is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     C:\Users\lukas\AppData\Roaming\nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


# Load Data

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

# %cd '/content/drive/MyDrive/TextMiningProject'

In [None]:
# Read both workbooks from the relative Data/ folder; the 'index' column of each
# sheet becomes the DataFrame index.
train = pd.read_excel('Data/train.xlsx', index_col='index')
train_reviews = pd.read_excel('Data/train_reviews.xlsx', index_col='index')

# 1. Exploratory Data Analysis

- Exploration is done on the whole training set (TODO: consider doing the train-test split before exploration, so the validation data stays unseen).

## 1.1 train.xlsx

In [None]:
# name of the target column inspected in this section
target_feature = 'unlisted'

# work on a copy of `train`, with its index moved into a regular column
dataframe = train.copy().reset_index()

In [None]:
# display head & tail
# (a bare DataFrame as the last expression is rendered by Jupyter as the cell output)
dataframe

In [None]:
# data types
# (the header is printed; the dtypes Series is the cell's last expression and is rendered below it)
print('Data Types:', '\n')
dataframe.dtypes

In [None]:
# missing values: per-column count of nulls next to the count of empty strings
print('Missing Values:', '\n')
pd.concat(
    {
        'Nulls': dataframe.isnull().sum(),
        'Empty Strings': dataframe.eq('').sum(),
    },
    axis=1,
)

In [None]:
# duplicated rows
# (counts rows that repeat an earlier row in every column, including the reset 'index' column)
print('Duplicated Rows:', '\n')
dataframe.duplicated().sum()

In [None]:
# descriptive statistics
# (include='all' covers numeric and non-numeric columns; .T shows one row per column)
print('Descriptive Statistics:', '\n')
dataframe.describe(include='all').T

In [None]:
# check for imbalance
# Count the number of instances for each target value
target_counts = dataframe[target_feature].value_counts()

# value_counts() orders classes by frequency, not by value, so the readable names
# are looked up per class instead of being passed to the legend by position.
target_names = {0: 'listed', 1: 'unlisted'}
wedge_labels = [target_names.get(value, value) for value in target_counts.index]

# Create a pie chart
fig, ax = plt.subplots()
ax.pie(target_counts, labels=wedge_labels, autopct='%1.1f%%')
ax.axis('equal')
ax.set_title('Target Imbalance Check')
# the legend reuses the wedge labels, so both always agree
ax.legend()
plt.show()

## 1.2 train_reviews.xlsx

In [None]:
# rebind `dataframe` to a copy of the reviews table, with its index moved into a regular column
dataframe = train_reviews.copy().reset_index()

In [None]:
# display head & tail
# (a bare DataFrame as the last expression is rendered by Jupyter as the cell output)
dataframe

In [None]:
# data types
# (the header is printed; the dtypes Series is the cell's last expression and is rendered below it)
print('Data Types:', '\n')
dataframe.dtypes

In [None]:
# missing values: per-column count of nulls next to the count of empty strings
print('Missing Values:', '\n')
pd.concat(
    {
        'Nulls': dataframe.isnull().sum(),
        'Empty Strings': dataframe.eq('').sum(),
    },
    axis=1,
)

In [None]:
# duplicated rows
# (counts rows that repeat an earlier row in every column, including the reset 'index' column)
print('Duplicated Rows:', '\n')
dataframe.duplicated().sum()

In [None]:
# descriptive statistics
# (include='all' covers numeric and non-numeric columns; .T shows one row per column)
print('Descriptive Statistics:', '\n')
dataframe.describe(include='all').T

# 2. Preprocessing


## 2.1 Regex Patterns

In [None]:
# Substitutions applied by text_preprocessing with re.sub, one after another in
# the order written here (dicts keep insertion order), on already lower-cased text.
regex_patterns = {
    # manually identified patterns
    # presumably escaped carriage returns left over from the Excel export - TODO confirm
    r'_x005f_x000d_':               ' ',
    r'_x000d_':                     ' ',
    # square-metre abbreviations. The lookbehind rejects a preceding letter and the
    # lookahead a following word character, so '25m2' and 'sqm' are rewritten while
    # ordinary words that merely contain 'sm' or 'm2' ('small', 'tourism') are not.
    r'(?<![^\W\d_])(?:m2|m²|sqm|sm)(?!\w)':  'squaremeter',
    # NOTE(review): '[ number]+' is a character class, so this also matches
    # 'license' followed by any run of those letters/spaces - confirm the intent
    r'license[ number]+[0-9a-z]+':  ' ',
    # tokens made of one repeated character ('aaa', 'xx'); single backslashes are
    # required inside a raw string, otherwise the pattern matches literal '\' characters
    r'\b(\w)\1+\b':                 ' ',

    # basic patterns
    # remove url
    r'http\S+':                     ' ',
    # remove html tags
    r'<.*?>':                       ' ',
    # remove punctuation
    r'[^\w\s]':                     ' ',
    # remove numbers
    r'\d+':                         ' ',
    # remove multiple whitespace
    r'\s+':                         ' ',
    # remove newline (already covered by the whitespace rule above; kept as a safeguard)
    r'\n':                          ' '
}

## 2.2 Preprocessing Functions

In [None]:
def text_preprocessing(dataframe):
  '''
  Preprocessing Pipeline
  Input: dataframe
  Output: transformed dataframe

  Every non-numeric column goes through these steps:
    1. lowercase (cells that are not strings become '')
    2. the index is moved into a column and fully duplicated rows are dropped
    3. the substitutions in `regex_patterns` are applied in dictionary order
    4. the language is detected with cld2; its code is stored in '<column>_lang'
       (texts of 5000+ characters get a placeholder string instead)
    5. the text is tokenized and English stopwords are removed
    6. English text is POS-tagged and lemmatized; text in a language listed in
       `stemmers` is stemmed; any other text keeps its tokens unchanged
    7. the tokens are joined back into one space-separated string

  The frame passed in is left untouched; a new frame is returned.
  '''
  # Imported here because the notebook-level pycld2 import is commented out,
  # which left `cld2` undefined when this function ran.
  import pycld2 as cld2

  stop_words = set(stopwords.words('english'))
  lemmatizer = nltk.stem.WordNetLemmatizer()
  stemmers = {
    'es': nltk.stem.SnowballStemmer('spanish'),
    'pt': nltk.stem.SnowballStemmer('portuguese'),
    'nl': nltk.stem.SnowballStemmer('dutch'),
    'de': nltk.stem.SnowballStemmer('german'),
    'it': nltk.stem.SnowballStemmer('italian'),
    'da': nltk.stem.SnowballStemmer('danish'),
    # NOTE(review): confirm cld2 reports Norwegian as 'nb'; if it uses another
    # code this stemmer is never selected
    'nb': nltk.stem.SnowballStemmer('norwegian'),
    'fi': nltk.stem.SnowballStemmer('finnish'),
    'sv': nltk.stem.SnowballStemmer('swedish')
  }
  # first letter of the tag returned by nltk.pos_tag -> WordNet part of speech
  pos_tag_map = {
    'J': wordnet.ADJ,
    'V': wordnet.VERB,
    'N': wordnet.NOUN,
    'R': wordnet.ADV
  }
  # compile once instead of once per cell (dictionary defined in the cell above)
  compiled_patterns = [(re.compile(pattern), replacement) for pattern, replacement in regex_patterns.items()]

  def clean(text):
    # apply every substitution in order
    for pattern, replacement in compiled_patterns:
      text = pattern.sub(replacement, text)
    return text

  def detect_language(text):
    # language code of cld2's first detail entry, or the placeholder for long texts
    if len(text) < 5000:
      return cld2.detect(text)[-1][0][1]
    return 'text has more than 5k characters'

  def normalise(text, lang):
    # tokenize, drop stopwords, lemmatize/stem depending on the language, re-join
    tokens = [word for word in word_tokenize(text) if word not in stop_words]
    if lang == 'en':
      lemmas = []
      for word, tag in nltk.pos_tag(tokens):
        wordnet_pos = pos_tag_map.get(tag[0])
        # words whose tag has no WordNet counterpart are kept as they are
        lemmas.append(lemmatizer.lemmatize(word, wordnet_pos) if wordnet_pos else word)
      tokens = lemmas
    elif lang in stemmers:
      tokens = [stemmers[lang].stem(word) for word in tokens]
    return ' '.join(tokens)

  # non-numeric features, determined before the index becomes a column
  categorical_features = list(dataframe.select_dtypes(exclude = np.number).columns)

  # reset_index returns a new frame, so none of the assignments below reach the caller's object
  dataframe = dataframe.reset_index()

  # lowercase non-numeric features (this also turns missing cells into '')
  for col in categorical_features:
    dataframe[col] = dataframe[col].apply(lambda x: x.lower() if type(x) == str else '')

  # drop duplicates
  dataframe = dataframe.drop_duplicates()

  # re.sub regex patterns
  for col in categorical_features:
    dataframe[col] = dataframe[col].apply(clean)

  # detect language
  for col in categorical_features:
    dataframe[col + '_lang'] = dataframe[col].apply(detect_language)

  # tokenize, remove stopwords, lemmatize / stem and combine again in a single pass
  for col in categorical_features:
    dataframe[col] = [normalise(text, lang) for text, lang in zip(dataframe[col], dataframe[col + '_lang'])]

  return dataframe

## 2.3 Apply Preprocessing

In [None]:
# preprocess a copy so the raw `train` frame stays available
train_preprocessed = text_preprocessing(train.copy())

In [None]:
# preprocess a copy so the raw `train_reviews` frame stays available
train_reviews_preprocessed = text_preprocessing(train_reviews.copy())

# 3. Feature Engineering (Sentiment Analysis on Comments)

- So far, sentiment analysis is only applied to English comments.

In [None]:
def sentiment_analysis(dataframe, column='comments', language_column='comments_lang', compound=False):
    '''
    Sentiment analysis using vaderSentiment
    Input: dataframe
           column: name of the text column to score
           language_column: name of the column holding the detected language
           compound: False -> store the label ('neg', 'neu' or 'pos') with the highest score
                     True  -> store the compound score (ranges from -1 to 1,
                              higher values indicating more positive sentiment)
    Output: the same dataframe with an added '<column>_sentiment' column

    Rows whose language is not English get 'neu' (labels) or 0 (compound score).
    Empty texts are labelled 'neu'.
    '''
    new_column = column + '_sentiment'
    sia = SentimentIntensityAnalyzer()

    # text_preprocessing stores cld2 language codes ('en'), so comparing against
    # 'ENGLISH' alone neutralised every row. Both spellings are accepted,
    # case-insensitively, so frames labelled with the full name keep working.
    is_english = dataframe[language_column].astype(str).str.lower().isin(['en', 'english'])

    if compound:
        scores = dataframe[column].apply(lambda text: sia.polarity_scores(text)['compound'])
        # exception for comments which are not in english
        dataframe[new_column] = scores.where(is_english, 0)
    else:
        def dominant_label(text):
            # an empty comment scores 0 everywhere, which max() would report as 'neg'
            if text == '':
                return 'neu'
            scores = {key: val for key, val in sia.polarity_scores(text).items() if key != 'compound'}
            return max(scores, key=scores.get)

        labels = dataframe[column].apply(dominant_label)
        # exception for comments which are not in english
        dataframe[new_column] = labels.where(is_english, 'neu')

    return dataframe

In [None]:
# label sentiment on a copy so the preprocessed reviews stay unchanged
train_reviews_sentiment = sentiment_analysis(train_reviews_preprocessed.copy())

In [None]:
# compound-score variant, again computed on a copy of the preprocessed reviews
train_reviews_sentiment_compound = sentiment_analysis(
    train_reviews_preprocessed.copy(),
    compound=True,
)

In [None]:
# train_preprocessed.to_csv('Data/train_preprocessed.csv', index=False)
# train_reviews_sentiment.to_csv('Data/train_reviews_sentiment.csv', index=False)
# train_reviews_sentiment_compound.to_csv('Data/train_reviews_sentiment_compound.csv', index=False)

# 4. Train-Test Split

In [2]:
# read in preprocessed data (so preprocessing doesn't have to be done again)
# keep_default_na=False to prevent empty strings from being read in as NaN
# NOTE(review): the to_csv calls that would write these files are commented out in
# this notebook, so both CSVs must already exist under Data/ for this cell to run.

train_preprocessed = pd.read_csv('Data/train_preprocessed.csv', keep_default_na=False)
train_reviews_sentiment = pd.read_csv('Data/train_reviews_sentiment.csv', keep_default_na=False)
# train_reviews_sentiment_compound = pd.read_csv('Data/train_reviews_sentiment_compound.csv', keep_default_na=False)

## 4.1 Combine Text

In [3]:
# combine all text for an airbnb and create one BoW per airbnb

def combine_text(dataframe1=None, dataframe2=None):
    '''
    Combine all text for an airbnb
    Input: dataframe1 (listing texts; defaults to the global train_preprocessed),
           dataframe2 (review texts with sentiment labels; defaults to the global
           train_reviews_sentiment)
    Output: combined dataframe
    Output format: 'unlisted', 'text' (indexed by 'index', the airbnb id)

    dataframe1 needs the columns 'index', 'description', 'host_about',
    'description_lang' and 'host_about_lang'; dataframe2 needs 'index',
    'comments' and 'comments_sentiment' (string labels).
    Neither input frame is modified.
    '''
    # The defaults are resolved at call time: binding the global frames in the
    # signature froze whatever objects existed when the cell was run, and the
    # in-place column assignment then altered that global frame.
    if dataframe1 is None:
        dataframe1 = train_preprocessed
    if dataframe2 is None:
        dataframe2 = train_reviews_sentiment

    # prepare dataframe1
    # combine description and host_about into one text per listing
    listing_text = [x + ' ' + y for x, y in zip(dataframe1['description'], dataframe1['host_about'])]
    # drop the source and language columns; assign() returns a new frame
    listings = (
        dataframe1
        .drop(['description', 'host_about', 'description_lang', 'host_about_lang'], axis=1)
        .assign(text=listing_text)
    )

    # prepare dataframe2
    # one pass over the groups: join all comments and all sentiment labels per listing
    per_listing = dataframe2.groupby('index')[['comments', 'comments_sentiment']].agg(lambda values: ' '.join(values))
    # append the sentiment labels to the comments so they become part of the text
    review_text = (
        (per_listing['comments'] + ' ' + per_listing['comments_sentiment'])
        .rename('comments')
        .reset_index()
    )

    # merge on index; listings without reviews are kept (left join)
    combined = pd.merge(listings, review_text, on='index', how='left')
    # fill empty cells for missing comments
    combined = combined.fillna('')
    # combine all text for an airbnb
    combined['text'] = [x + ' ' + y for x, y in zip(combined['text'], combined['comments'])]
    # drop comments
    combined = combined.drop(['comments'], axis=1)
    # set index to 'index' or airbnb id
    combined = combined.set_index('index')

    return combined

In [4]:
# hand copies to combine_text so the frames loaded from CSV stay as they were read
train_preprocessed_to_combine = train_preprocessed.copy()
train_reviews_sentiment_to_combine = train_reviews_sentiment.copy()
train_combined = combine_text(
    train_preprocessed_to_combine,
    train_reviews_sentiment_to_combine,
)
train_combined.head(3)

Unnamed: 0_level_0,unlisted,text
index,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0,share mixed room hostel share bathroom locate ...
2,1,meu espac fic pert de parqu eduard vii saldanh...
3,1,trafaria house cozy familiar villa facility ne...


## 4.2 Train-Test Split

In [5]:
# features: combined text per airbnb; target: the 'unlisted' flag
x = train_combined['text']
y = train_combined['unlisted']
# 70/30 split, stratified on the target, with a fixed seed
xtrain, xval, ytrain, yval = train_test_split(
    x,
    y,
    test_size=0.3,
    stratify=y,
    random_state=420,
)

# 5. Encoding & SMOTE

## 5.2 Bag-of-Words

In [None]:
# initialize bow vectorizer
bow = CountVectorizer()

# fit the vocabulary on the training split only, then encode both splits.
# Both matrices stay sparse: a dense documents-x-vocabulary array is very large,
# and the training matrix was the only one being densified.
xtrain_bow = bow.fit_transform(xtrain)
xval_bow = bow.transform(xval)

# SMOTE
# oversample the training split only; seeded (same seed as the train/validation
# split) so the synthetic samples are reproducible
smote = SMOTE(random_state=420)

# fit and resample
xtrain_bow, ytrain_bow = smote.fit_resample(xtrain_bow, ytrain)

## 5.3 TF-IDF

In [6]:
# initialize tfidf vectorizer
word_tfidf = TfidfVectorizer()

# fit the vocabulary / idf weights on the training split only, then encode both splits.
# Both matrices stay sparse: a dense documents-x-vocabulary array is very large,
# and the training matrix was the only one being densified.
xtrain_word = word_tfidf.fit_transform(xtrain)
xval_word = word_tfidf.transform(xval)

# SMOTE
# oversample the training split only; seeded (same seed as the train/validation
# split) so the synthetic samples are reproducible
smote = SMOTE(random_state=420)

# fit and resample
xtrain_word, ytrain_word = smote.fit_resample(xtrain_word, ytrain)

## 5.4 Word Embeddings

# 6. Modelling

## 6.1 Modelling Functions

In [None]:
def multiple_roc_auc(classifiers, xtrain, xval, ytrain, yval):
    '''
    Fits each classifier on the training data and draws all validation
    ROC curves on one shared axis, together with a dashed diagonal.
    Input: classifiers (dict: display name -> estimator), xtrain, xval, ytrain, yval
    Output: ROC/AUC curves plot (the return value of plt.show())
    '''

    _, roc_axis = plt.subplots(1, figsize=(15, 10))

    for label in classifiers:
        model = classifiers[label]
        # the estimator stored in the dict is fitted in place
        model.fit(xtrain, ytrain)
        RocCurveDisplay.from_estimator(model, xval, yval, ax=roc_axis, name=label)

    roc_axis.set_title('Receiver Operating Characteristic (ROC)')
    # diagonal reference line from (0, 0) to (1, 1)
    roc_axis.plot([0,1], [0,1], linestyle='--')

    return plt.show()

def learning_curves(estimator, x, y, cv=10, scoring='f1'):
    '''
    Plots learning curve for different training set sizes
    Input: estimator, x, y, (optional: cv, scoring)
           scoring may be a scorer name or anything else sklearn's
           learning_curve accepts (e.g. a scorer callable)
    Output: learning curve plot (the return value of plt.show())
    '''

    # training-set fractions 0.05, 0.10, ... up to (not including) 1.0
    train_sizes, train_scores, validation_scores = learning_curve(
        estimator, x, y, cv=cv, scoring=scoring, train_sizes=np.arange(.05,1,.05))

    # average over the cross-validation folds (one column per fold)
    train_mean = np.mean(train_scores, axis=1)
    validation_mean = np.mean(validation_scores, axis=1)

    # only string scorers have .upper(); callables / None get a generic axis label
    score_name = scoring.upper() if isinstance(scoring, str) else 'Model'

    fig, ax = plt.subplots(1, figsize=(10,10))
    ax.plot(train_sizes, train_mean, color='salmon',  label='Training score', marker = 'o')
    ax.plot(train_sizes, validation_mean, color='olive', label='Cross-validation score', marker = 's')
    ax.set_title('Learning Curve')
    ax.set_xlabel('Training Set Size')
    ax.set_ylabel(f'{score_name} Score')
    ax.legend(loc='best')
    return plt.show()

## 6.2 Baseline Model

In [None]:
# # classifiers to check
# classifiers = {
#                 'K-Nearest Neighbors': KNeighborsClassifier(),
#                 'Logistic Regression': LogisticRegression(),
#                 'Random Forest': RandomForestClassifier()
#                 }

# multiple_roc_auc(classifiers, xtrain_word, xval_word, ytrain, yval)

### 6.2.1 Logistic Regression

In [None]:
# Bag-of-Words

# fit on the resampled training data (fit returns the estimator itself)
logreg = LogisticRegression(n_jobs=-1).fit(xtrain_bow, ytrain_bow)
# predict the validation split and report per-class metrics
ypred = logreg.predict(xval_bow)
print(classification_report(yval, ypred))

In [None]:
# TF-IDF

# fit on the resampled training data (fit returns the estimator itself)
logreg = LogisticRegression(n_jobs=-1).fit(xtrain_word, ytrain_word)
# predict the validation split and report per-class metrics
ypred = logreg.predict(xval_word)
print(classification_report(yval, ypred))

### 6.2.2 RandomForest

In [None]:
# Bag-of-Words

# seeded (same seed as the train/validation split) so the forest, and with it
# the report below, is reproducible across runs
rf = RandomForestClassifier(n_jobs=-1, random_state=420)
# fit model on training data
rf.fit(xtrain_bow, ytrain_bow)
# make predictions on validation data
ypred = rf.predict(xval_bow)
print(classification_report(yval, ypred))

In [7]:
# TF-IDF

# seeded (same seed as the train/validation split) so the forest, and with it
# the report below, is reproducible across runs
rf = RandomForestClassifier(n_jobs=-1, random_state=420)
# fit model on training data
rf.fit(xtrain_word, ytrain_word)
# make predictions on validation data
ypred = rf.predict(xval_word)
# print classification report
print(classification_report(yval, ypred))

              precision    recall  f1-score   support

           0       0.93      0.89      0.91      2710
           1       0.75      0.83      0.79      1039

    accuracy                           0.88      3749
   macro avg       0.84      0.86      0.85      3749
weighted avg       0.88      0.88      0.88      3749



## 6.3 Neural Networks

### 6.3.1 Recurrent Neural Networks

### 6.3.2 LSTM

## 6.4 Transfer Learning