# Feature engineering in description field

  * Topic Modelling

In [1]:
# For Analysis
import numpy as np
import pandas as pd

# For Visualizations
import matplotlib.pyplot as plt
import seaborn as sns

#For Modeling
from sklearn import metrics
from sklearn.model_selection import RandomizedSearchCV,train_test_split,GridSearchCV
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.ensemble import RandomForestRegressor
import xgboost as xgb
from xgboost import plot_importance
from sklearn.metrics import explained_variance_score, mean_squared_error, r2_score

#Hide warnings
import warnings
warnings.filterwarnings('ignore')
%matplotlib inline
from datetime import datetime


# for feature engineering
from sklearn import linear_model
from sklearn import preprocessing
import matplotlib.pyplot as plt
#from haversine import haversine 
from progressbar import ProgressBar
import yaml

## for text field processing
import nltk
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from sklearn import datasets, feature_extraction, decomposition
from nltk.stem.porter import PorterStemmer

import gensim
from gensim import corpora
import re
import nltk
from nltk.corpus import stopwords
from stop_words import get_stop_words
import operator



In [32]:
!pip install pyLDAvis
import pyLDAvis
import pyLDAvis.gensim
nltk.download('vader_lexicon')
!pip install stop-words
import stop_words
from stop_words import get_stop_words
nltk.download('punkt')

# Importing data

In [6]:
#!!wget https://www.dropbox.com/s/2pyir77wrhd6f4u/case_study.zip

In [7]:
#!unzip /content/case_study.zip

In [8]:
# importing train data
train_data=pd.read_csv('/content/train.csv')

# importing test data
test_data=pd.read_excel('/content/test.xlsx')

In [12]:
train_data.shape , test_data.shape

((49999, 31), (24111, 29))

In [23]:
# Droping unknown column

train_data.drop(' ',axis=1,inplace=True)
test_data.drop(' ',axis=1,inplace=True)

In [11]:
## Finding sentiment intensity score for description field in train data
## saving "compound" score to listings dataframe

train_data["Description_Length"] = np.nan
sent_int = SentimentIntensityAnalyzer()

for i in range(len(train_data["description"])):
    s = train_data['description'][i]
    
    ## generating sentiment intensity score for descriptions
    sentiment = sent_int.polarity_scores(s)
    train_data.loc[i, "Sentiment_intensity"] = sentiment["compound"]
    
    ## Findig the length of the description
    train_data.loc[i,"Description_Length"] = len(train_data["description"][i].split())





In [14]:
## Topic modelling on description using LDA in train data
stop = get_stop_words('en')
add_stop_word = ['etc', 'well', 'll', 'get', 'within, ''please', 'good', 'many', 'l', 'g', 'c', 
                 'want']
stop.extend(add_stop_word)
desc = []

## Creating a list of description
desc_list = [train_data["description"][i] for i in range(len(train_data["description"]))]

for i in desc_list:
    ## tokanizing descrption into words
    desc_list_w = nltk.tokenize.word_tokenize(i)
    
    ## converting all words to lower case
    desc_lower = [w.lower() for w in desc_list_w]
    
    ## removing numerical values and special characters
    desc_words = [w for w in desc_lower if re.search('^[a-z]+$',w)]
    
    ## removing stop words
    desc_stopremoved = [w for w in desc_words if w not in stop]
    
    ## add tokens to list
    desc.append(desc_stopremoved)

In [15]:
## Gensim's Dictionary encapsulates the mapping between words and their integer ids. 
desc_dict = corpora.Dictionary(desc)

## filter out words which appear in less than 20 documents or more than 15% documents
desc_dict.filter_extremes(no_below=20, no_above=0.15) 
print(desc_dict)

## creating bag of words
corpus = [desc_dict.doc2bow(d) for d in desc]
len(corpus)

Dictionary(6243 unique tokens: ['bridge', 'brooklyn', 'brownstone', 'food', 'promenade']...)


49999

In [22]:
## training an LDA model
lda_model = gensim.models.LdaModel(corpus, alpha='auto', num_topics=5, id2word=desc_dict)

In [17]:
## printing topics for lda model
lda_model.show_topics(num_topics=5,num_words=30)

[(0,
  '0.007*"fully" + 0.007*"towels" + 0.007*"guest" + 0.007*"closet" + 0.007*"cable" + 0.007*"microwave" + 0.006*"use" + 0.006*"unit" + 0.006*"bath" + 0.006*"dryer" + 0.005*"mattress" + 0.005*"shower" + 0.005*"internet" + 0.005*"studio" + 0.005*"entrance" + 0.005*"air" + 0.005*"equipped" + 0.005*"sofa" + 0.005*"provided" + 0.005*"beds" + 0.005*"shared" + 0.005*"dining" + 0.004*"small" + 0.004*"washer" + 0.004*"includes" + 0.004*"table" + 0.004*"furnished" + 0.004*"netflix" + 0.004*"flat" + 0.004*"separate"'),
 (1,
  '0.013*"business" + 0.012*"couples" + 0.012*"travelers" + 0.012*"solo" + 0.011*"adventurers" + 0.011*"metro" + 0.008*"bus" + 0.008*"station" + 0.008*"square" + 0.008*"line" + 0.008*"san" + 0.007*"francisco" + 0.007*"center" + 0.007*"dc" + 0.006*"minute" + 0.006*"within" + 0.006*"easy" + 0.006*"central" + 0.006*"public" + 0.005*"district" + 0.005*"chicago" + 0.005*"families" + 0.005*"transportation" + 0.004*"hill" + 0.004*"shops" + 0.004*"market" + 0.004*"steps" + 0.004*"

In [18]:
## generating topic describution over each description

train_data["topic_general"] = 0.0        ## for topic 0
train_data["Topic_utilities"] = 0.0      ## for topic 1
train_data["Topic_Transport"] = 0.0      ## for topic 2
train_data["Topic_listing_desc"] = 0.0   ## for topic 3
train_data["Topic_attractions"] = 0.0    ## for topic 4

## defining topic_name to assign topic names
def topic_name(i):
        switcher={
                0:'topic_general',
                1:'Topic_utilities',
                2:'Topic_Transport',
                3:'Topic_listing_desc',
                4:'Topic_attractions',
             }
        return switcher.get(i,"Invalid")
        
for d in range(len(train_data["description"])):
    bow = desc_dict.doc2bow(desc[d])
    t = lda_model.get_document_topics(bow)
    
    for i in range(len(t)):
        #print("description", d, ":", t[i][0], t[i][1])
        temp = topic_name(t[i][0])
        train_data.loc[d,temp] = t[i][1]
    #print("------")

# doing same process for Test data

In [26]:
## Finding sentiment intensity score for description field in test data
## saving "compound" score to listings dataframe

test_data["Description_Length"] = np.nan
sent_int = SentimentIntensityAnalyzer()

for i in range(len(test_data["description"])):
    s = test_data['description'][i]
    
    ## generating sentiment intensity score for descriptions
    sentiment = sent_int.polarity_scores(str(s))
    test_data.loc[i, "Sentiment_intensity"] = sentiment["compound"]
    
    ## Findig the length of the description
    test_data.loc[i,"Description_Length"] = len(str(test_data["description"][i]).split())

In [28]:
## Topic modelling on description using LDA
stop = get_stop_words('en')
add_stop_word = ['etc', 'well', 'll', 'get', 'within, ''please', 'good', 'many', 'l', 'g', 'c', 
                 'want']
stop.extend(add_stop_word)
desc = []

## Creating a list of description
desc_list = [test_data["description"][i] for i in range(len(test_data["description"]))]

for i in desc_list:
    ## tokanizing descrption into words
    desc_list_w = nltk.tokenize.word_tokenize(str(i))
    
    ## converting all words to lower case
    desc_lower = [w.lower() for w in desc_list_w]
    
    ## removing numerical values and special characters
    desc_words = [w for w in desc_lower if re.search('^[a-z]+$',w)]
    
    ## removing stop words
    desc_stopremoved = [w for w in desc_words if w not in stop]
    
    ## add tokens to list
    desc.append(desc_stopremoved)

In [29]:
## Gensim's Dictionary encapsulates the mapping between words and their integer ids. 
desc_dict = corpora.Dictionary(desc)

## filter out words which appear in less than 20 documents or more than 15% documents
desc_dict.filter_extremes(no_below=20, no_above=0.15) 
print(desc_dict)

## creating bag of words
corpus = [desc_dict.doc2bow(d) for d in desc]
len(corpus)

Dictionary(4384 unique tokens: ['ac', 'appliances', 'around', 'brother', 'bushwick']...)


24111

In [30]:
## generating topic describution over each description in test data

test_data["topic_general"] = 0.0        ## for topic 0
test_data["Topic_utilities"] = 0.0      ## for topic 1
test_data["Topic_Transport"] = 0.0      ## for topic 2
test_data["Topic_listing_desc"] = 0.0   ## for topic 3
test_data["Topic_attractions"] = 0.0    ## for topic 4

## defining topic_name to assign topic names
def topic_name(i):
        switcher={
                0:'topic_general',
                1:'Topic_utilities',
                2:'Topic_Transport',
                3:'Topic_listing_desc',
                4:'Topic_attractions',
             }
        return switcher.get(i,"Invalid")
        
for d in range(len(test_data["description"])):
    bow = desc_dict.doc2bow(desc[d])
    t = lda_model.get_document_topics(bow)
    
    for i in range(len(t)):
        #print("description", d, ":", t[i][0], t[i][1])
        temp = topic_name(t[i][0])
        test_data.loc[d,temp] = t[i][1]
    #print("------")

In [31]:
train_data.to_csv('train_LDA.csv',index=False)
test_data.to_csv('test_LDA.csv',index=False)