# Imports

In [1]:
# basics
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns

# preprocessing
import re
import nltk
from nltk.corpus import stopwords
from nltk.stem import SnowballStemmer
from nltk.stem.wordnet import WordNetLemmatizer
nltk.download('stopwords')
nltk.download('wordnet')

# sentiment analysis (vaderSentiment)
try:
    from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer
except:
    !pip install vaderSentiment
    from vaderSentiment.vaderSentiment import SentimentIntensityAnalyzer

# # pycld2
# try:
#   import pycld2 as cld2
# except:
#   !pip install pycld2
#   import pycld2 as cld2

# # deep_translator
# try:
#   from deep_translator import GoogleTranslator
# except:
#   !pip install deep-translator
#   from deep_translator import GoogleTranslator

# sklearn
from sklearn.model_selection import train_test_split
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.metrics import classification_report

[nltk_data] Downloading package stopwords to
[nltk_data]     C:\Users\lukas\AppData\Roaming\nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     C:\Users\lukas\AppData\Roaming\nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


# Load Data

In [None]:
# from google.colab import drive
# drive.mount('/content/drive')

# %cd '/content/drive/MyDrive/TextMiningProject'

In [None]:
# read the listing table and the review table; both use the 'index' column as row index
train, train_reviews = (
    pd.read_excel(workbook, index_col='index')
    for workbook in ('Data/train.xlsx', 'Data/train_reviews.xlsx')
)

# 1. Exploratory Data Analysis

- The exploration below runs on the whole training set (TODO: consider doing the train-test split before exploration).

## 1.1 train.xlsx

In [None]:
# name of the label column used by the imbalance check below
target_feature = 'unlisted'

# working copy of `train` with the 'index' index moved back into a regular column
dataframe = train.copy().reset_index()

In [None]:
# show the working copy of `train`; the bare trailing expression is rendered as the cell output
# (pandas abbreviates long frames to their first and last rows)
dataframe

In [None]:
# data types: print a heading, then render the per-column dtypes as the cell output
print('Data Types:', '\n')
dataframe.dtypes

In [None]:
# missing values: per-column count of nulls next to the count of empty strings
print('Missing Values:', '\n')
pd.DataFrame({
    'Nulls': dataframe.isnull().sum(),
    'Empty Strings': dataframe.eq('').sum(),
})

In [None]:
# duplicated rows: number of rows that are exact repeats of an earlier row
# (the 'index' column is part of the comparison because of the reset_index above)
print('Duplicated Rows:', '\n')
dataframe.duplicated().sum()

In [None]:
# descriptive statistics for every column (numeric and non-numeric),
# transposed so that each original column becomes one row of the summary
print('Descriptive Statistics:', '\n')
dataframe.describe(include='all').T

In [None]:
# check for imbalance
# Count the number of instances for each target value
target_counts = dataframe[target_feature].value_counts()

# value_counts() orders the classes by frequency, so each slice is named from
# its own class code instead of relying on a fixed legend order
target_names = {0: 'listed', 1: 'unlisted'}
slice_labels = [target_names.get(code, str(code)) for code in target_counts.index]

# Create a pie chart
fig, ax = plt.subplots()
ax.pie(target_counts, labels=slice_labels, autopct='%1.1f%%')
ax.axis('equal')
ax.set_title('Target Imbalance Check')
ax.legend()
plt.show()

## 1.2 train_reviews.xlsx

In [None]:
# working copy of `train_reviews` with the 'index' index moved back into a regular column
dataframe = train_reviews.copy().reset_index()

In [None]:
# show the working copy of `train_reviews`; the bare trailing expression is rendered as the cell output
# (pandas abbreviates long frames to their first and last rows)
dataframe

In [None]:
# data types: print a heading, then render the per-column dtypes as the cell output
print('Data Types:', '\n')
dataframe.dtypes

In [None]:
# missing values: per-column count of nulls next to the count of empty strings
print('Missing Values:', '\n')
pd.DataFrame({
    'Nulls': dataframe.isnull().sum(),
    'Empty Strings': dataframe.eq('').sum(),
})

In [None]:
# duplicated rows: number of rows that are exact repeats of an earlier row
# (the 'index' column is part of the comparison because of the reset_index above)
print('Duplicated Rows:', '\n')
dataframe.duplicated().sum()

In [None]:
# descriptive statistics for every column (numeric and non-numeric),
# transposed so that each original column becomes one row of the summary
print('Descriptive Statistics:', '\n')
dataframe.describe(include='all').T

# 2. Preprocessing

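- TODO: lemmatize based on the detected language (currently English stop words and the English WordNet lemmatizer are applied to all text, and the language is only detected afterwards)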

## 2.1 Regex Patterns

In [None]:
# Ordered mapping of regex pattern -> replacement. text_preprocessing applies the
# entries one after another in this order, on text that is already lowercased.
regex_patterns = {
    # manually identified patterns
    # escape sequences found in the raw text (presumably escaped carriage
    # returns from the .xlsx source); the longer form must be replaced first
    r'_x005f_x000d_':               ' ',
    r'_x000d_':                     ' ',
    # square-metre abbreviations, matched only as a standalone unit: not preceded
    # by a letter and not followed by a letter or digit, so "50sqm" and "30 m2"
    # match while "small" or "tourism" are left untouched
    r'(?<![^\W\d])(?:sqm|sm|m2|m²)(?![^\W_])': 'squaremeter',
    r'license[ number]+[0-9a-z]+':  ' ',
    # whole words made of a single repeated character (e.g. "aaa", "xx")
    r'\b(\w)\1+\b':                 ' ',

    # basic patterns
    # remove url
    r'http\S+':                     ' ',
    # remove html tags
    r'<.*?>':                       ' ',
    # remove punctuation
    r'[^\w\s]':                     ' ',
    # remove numbers
    r'\d+':                         ' ',
    # collapse every whitespace run (including newlines) into one space
    r'\s+':                         ' ',
}

## 2.2 Preprocessing Functions

In [None]:
def text_preprocessing(dataframe):
  '''
  Preprocessing Pipeline
  Input: dataframe
  Output: transformed dataframe

  Steps applied to every non-numeric column:
    1. lowercase strings; any non-string value (e.g. NaN) becomes ''
    2. move the index into a column and drop exact duplicate rows
    3. apply the substitutions in `regex_patterns`, in dictionary order
    4. remove English stop words and lemmatize the remaining tokens
    5. add a '<column>_lang' column with the language name reported by pycld2

  The input frame is not modified; a new frame is returned.
  '''
  # imported here because the notebook's top-level pycld2 import is commented
  # out; failing early is cheaper than failing after the cleaning steps
  import pycld2 as cld2

  stop = set(stopwords.words('english'))
  lemma = WordNetLemmatizer()

  # text columns are chosen before the index is moved into the frame,
  # so the index column itself is never treated as text
  categorical_features = list(dataframe.select_dtypes(exclude=np.number).columns)

  # reset_index returns a new frame, so the caller's frame stays untouched
  dataframe = dataframe.reset_index()

  # lowercase non-numeric features; non-strings become '' so no NaN is left
  for col in categorical_features:
    dataframe[col] = dataframe[col].apply(lambda x: x.lower() if isinstance(x, str) else '')

  # drop duplicates (after lowercasing, so case-only variants collapse too)
  dataframe = dataframe.drop_duplicates()

  # compile each pattern once instead of once per column and per pattern
  compiled_patterns = [(re.compile(pattern), replacement)
                       for pattern, replacement in regex_patterns.items()]

  def clean_text(text):
    # regex substitutions, then stop word removal and lemmatization
    for pattern, replacement in compiled_patterns:
      text = pattern.sub(replacement, text)
    return ' '.join(lemma.lemmatize(word) for word in text.split() if word not in stop)

  def detect_language(text):
    # texts of 5000+ characters are not passed to the detector
    if len(text) >= 5000:
      return 'text has more than 5k characters'
    # last element of the detect() result holds the per-language details;
    # take the name of the first (top) entry
    return cld2.detect(text)[-1][0][0]

  for col in categorical_features:
    dataframe[col] = dataframe[col].apply(clean_text)

  # language columns are added in a second pass so they come after all text columns
  for col in categorical_features:
    dataframe[col + '_lang'] = dataframe[col].apply(detect_language)

  return dataframe

## 2.3 Apply Preprocessing

In [None]:
# preprocess a copy so the raw `train` frame stays available for re-runs
train_preprocessed = text_preprocessing(train.copy())

In [None]:
# preprocess a copy so the raw `train_reviews` frame stays available for re-runs
train_reviews_preprocessed = text_preprocessing(train_reviews.copy())

# 3. Feature Engineering (Sentiment Analysis on Comments)

- so far, sentiment analysis is only applied to English comments (all other comments are labelled neutral)

In [None]:
def sentiment_analysis(dataframe, column='comments', language_column='comments_lang', compound=False, analyzer=None):
    '''
    Sentiment analysis using vaderSentiment
    Input: dataframe
    Output: dataframe with sentiment column

    Parameters:
      dataframe:        frame holding the text and its detected language;
                        the sentiment column is added to this frame in place
      column:           name of the text column to score
      language_column:  name of the column with the detected language name;
                        only rows equal to 'ENGLISH' are scored
      compound:         falsy -> store the label ('neg', 'neu' or 'pos') with the
                        highest score; truthy -> store the compound score, a
                        combined score that ranges from -1 to 1 where higher
                        values indicate more positive sentiment
      analyzer:         optional object with a polarity_scores(text) method;
                        a new SentimentIntensityAnalyzer is created when omitted

    Returns the same dataframe with the new column '<column>_sentiment'.
    Non-English rows get 'neu' (label mode) or 0 (compound mode); empty
    strings get 'neu' in label mode.
    '''
    new_column = column + '_sentiment'
    sia = SentimentIntensityAnalyzer() if analyzer is None else analyzer

    # positional mask, so the result does not depend on index alignment
    is_english = (dataframe[language_column] == 'ENGLISH').to_numpy()

    # truthiness instead of `is True` / `is False`, so values such as 1 or
    # numpy booleans select a branch instead of silently adding no column
    if compound:
        scores = dataframe[column].apply(lambda text: sia.polarity_scores(text)['compound'])
        # exception for comments which are not in english
        dataframe[new_column] = scores.where(is_english, 0)
    else:
        def strongest_label(text):
            # an empty comment scores 0 everywhere and would otherwise come out as 'neg'
            if text == '':
                return 'neu'
            scores = sia.polarity_scores(text)
            # ties resolve to the first label in this order
            return max(('neg', 'neu', 'pos'), key=scores.get)

        labels = dataframe[column].apply(strongest_label)
        # exception for comments which are not in english
        dataframe[new_column] = labels.where(is_english, 'neu')

    return dataframe

In [None]:
# label mode: score a copy so the preprocessed reviews stay unchanged
train_reviews_sentiment = sentiment_analysis(train_reviews_preprocessed.copy())

In [None]:
# compound mode: score a separate copy so the preprocessed reviews stay unchanged
train_reviews_sentiment_compound = sentiment_analysis(
    train_reviews_preprocessed.copy(),
    compound=True,
)

In [None]:
# cache the preprocessed frames on disk so later sections can reload them
# instead of repeating preprocessing; index=False leaves the row index out
# of the files (the 'index' id is already a regular column at this point)
train_preprocessed.to_csv('Data/train_preprocessed.csv', index=False)
train_reviews_sentiment.to_csv('Data/train_reviews_sentiment.csv', index=False)
train_reviews_sentiment_compound.to_csv('Data/train_reviews_sentiment_compound.csv', index=False)

# 4. Train-Test Split

In [2]:
# reload the cached preprocessing results, so preprocessing need not be repeated
# keep_default_na=False keeps empty strings as '' instead of turning them into NaN
train_preprocessed, train_reviews_sentiment, train_reviews_sentiment_compound = (
    pd.read_csv(csv_path, keep_default_na=False)
    for csv_path in (
        'Data/train_preprocessed.csv',
        'Data/train_reviews_sentiment.csv',
        'Data/train_reviews_sentiment_compound.csv',
    )
)

## 4.1 Combine Text

In [3]:
# combine all text for an airbnb and create one BoW per airbnb

def combine_text(dataframe1=None, dataframe2=None):
    '''
    Combine all text for an airbnb
    Input: dataframe1 (train_preprocessed), dataframe2 (train_reviews_sentiment)
    Output: combined dataframe
    Output format: 'unlisted', 'text'

    dataframe1 needs the columns 'index', 'description', 'host_about',
    'description_lang' and 'host_about_lang'; dataframe2 needs 'index',
    'comments' and 'comments_sentiment'. When an argument is omitted, the
    notebook globals train_preprocessed / train_reviews_sentiment are used.
    Neither input frame is modified. The result is indexed by 'index' and keeps
    every dataframe1 column that is not dropped (e.g. 'unlisted') plus 'text'.
    '''
    # defaults are looked up at call time: defaults bound in the signature would
    # keep pointing at whatever frames existed when this cell was first run
    if dataframe1 is None:
        dataframe1 = train_preprocessed
    if dataframe2 is None:
        dataframe2 = train_reviews_sentiment

    # listing text: description and host_about joined by a space; assign()
    # builds a new frame, so the caller's frame does not gain a 'text' column
    listings = dataframe1.assign(
        text=dataframe1['description'] + ' ' + dataframe1['host_about']
    ).drop(['description', 'host_about', 'description_lang', 'host_about_lang'], axis=1)

    # review text: per listing, all comments followed by all sentiment labels
    reviews_by_listing = dataframe2.groupby('index')
    joined_comments = reviews_by_listing['comments'].agg(lambda values: ' '.join(values))
    joined_labels = reviews_by_listing['comments_sentiment'].agg(lambda values: ' '.join(values))
    review_text = (joined_comments + ' ' + joined_labels).rename('comments').reset_index()

    # left merge keeps listings without any review; their comments become ''
    combined = pd.merge(listings, review_text, on='index', how='left')
    combined = combined.fillna('')

    # combine all text for an airbnb
    combined['text'] = combined['text'] + ' ' + combined['comments']
    combined = combined.drop(['comments'], axis=1)

    # set index to 'index' or airbnb id
    return combined.set_index('index')

In [4]:
# hand copies to combine_text so the reloaded frames stay as they were read
train_preprocessed_to_combine = train_preprocessed.copy()
train_reviews_sentiment_to_combine = train_reviews_sentiment.copy()
train_combined = combine_text(
    train_preprocessed_to_combine,
    train_reviews_sentiment_to_combine,
)
train_combined.head(3)

Unnamed: 0_level_0,unlisted,text
index,Unnamed: 1_level_1,Unnamed: 2_level_1
1,0,shared mixed room hostel shared bathroom locat...
2,1,meu espaço fica perto de parque eduardo vii sa...
3,1,trafaria house cozy familiar villa facility ne...


## 4.2 Train-Test Split

In [5]:
# features: combined text per listing; target: the 'unlisted' flag
x = train_combined['text']
y = train_combined['unlisted']

# 70/30 split, stratified on the target, with a fixed seed
xtrain, xval, ytrain, yval = train_test_split(
    x,
    y,
    test_size=0.3,
    stratify=y,
    random_state=420,
)

# 5. Encoding

## 5.1 Bag-of-Words

## 5.2 TF-IDF

In [6]:
# word-level TF-IDF on unigrams; max_df=0.8 is the document-frequency cut-off
word_tfidf = TfidfVectorizer(
    ngram_range=(1, 1),
    max_df=0.8,
)

# the vocabulary is fitted on the training texts only and reused for validation
xtrain_word = word_tfidf.fit_transform(xtrain)
xval_word = word_tfidf.transform(xval)

## 5.3 Word Embeddings

# 6. Modelling

## 6.1 Baseline Model

### 6.1.1 Logistic Regression

In [7]:
# baseline: logistic regression with default settings, fitted on the training TF-IDF matrix
logreg = LogisticRegression()
logreg.fit(X=xtrain_word, y=ytrain)

# evaluate on the validation split
ypred = logreg.predict(X=xval_word)
print(classification_report(y_true=yval, y_pred=ypred))

              precision    recall  f1-score   support

           0       0.92      0.91      0.91      2710
           1       0.77      0.79      0.78      1039

    accuracy                           0.88      3749
   macro avg       0.85      0.85      0.85      3749
weighted avg       0.88      0.88      0.88      3749



### 6.1.2 RandomForest

In [8]:
# fixed seed (the same value as the train/validation split) so that the forest,
# and therefore the report below, is reproducible on Restart & Run All
rf = RandomForestClassifier(random_state=420)
# fit model on training data
rf.fit(xtrain_word, ytrain)
# make predictions on validation data
ypred = rf.predict(xval_word)
# print classification report
print(classification_report(yval, ypred))

              precision    recall  f1-score   support

           0       0.93      0.91      0.92      2710
           1       0.78      0.81      0.79      1039

    accuracy                           0.88      3749
   macro avg       0.85      0.86      0.86      3749
weighted avg       0.89      0.88      0.88      3749



## 6.2 Neural Networks

### 6.2.1 Recurrent Neural Networks

### 6.2.2 LSTM

## 6.3 Transfer Learning