In [1]:
## Importing necessary libraries ##

import re, string, unicodedata
import nltk
import contractions
import inflect
from bs4 import BeautifulSoup
from nltk import word_tokenize, sent_tokenize
from nltk.corpus import stopwords
from nltk.stem import LancasterStemmer, WordNetLemmatizer

In [2]:
### Fetching data from MongoDB ###
# Connects to a MongoDB server on localhost:27017 and reads the `app_name`
# collection of the `apps` database into a DataFrame.

from pymongo import MongoClient
client = MongoClient('localhost', 27017)
db = client.apps
collection = db.app_name
print(collection)

import pandas as pd
# Empty filter = every document; the projection drops `_id` and keeps only
# the description, category and app_name fields.
df_apps = pd.DataFrame(list(collection.find({},{'_id':0, 'description': 1, 'category': 1, 'app_name': 1})))
# Full-range slice: every row is kept (no subsetting happens here).
df_apps = df_apps[:]

Collection(Database(MongoClient(host=['localhost:27017'], document_class=dict, tz_aware=False, connect=True), 'apps'), 'app_name')


In [3]:
### Declaring preprocessing steps for text cleaning/pre-processing ###


def remove_between_square_brackets(text):
    """Remove every square-bracketed span, brackets included, from a string.

    Args:
        text: The string to clean.

    Returns:
        ``text`` with each ``[...]`` span deleted. Surrounding whitespace is
        left untouched, and an opening bracket with no closing bracket is
        kept as-is.
    """
    # Raw string: '\[' and '\]' are invalid escape sequences in a plain
    # string literal and trigger a warning on recent Python versions.
    return re.sub(r'\[[^]]*\]', '', text)

def replace_contractions(text):
    """Return ``text`` after passing it through ``contractions.fix``.

    Used to replace contractions (e.g. "didn't", "haven't") in a string.
    """
    expanded_text = contractions.fix(text)
    return expanded_text

def remove_non_ascii(words):
    """Strip non-ASCII characters from each token in a list of words.

    Each token is NFKD-normalized first, so an accented letter is reduced to
    its ASCII base letter instead of vanishing entirely. Tokens that end up
    empty are kept as empty strings.
    """
    def _ascii_only(token):
        decomposed = unicodedata.normalize('NFKD', token)
        return decomposed.encode('ascii', 'ignore').decode('utf-8', 'ignore')

    return [_ascii_only(token) for token in words]

def to_lowercase(words):
    """Return a new list with every token of ``words`` lower-cased."""
    return [token.lower() for token in words]

def remove_punctuation(words):
    """Delete punctuation from each token and drop tokens left empty.

    Any character that is neither a word character (``\\w``) nor whitespace
    is removed from every token.
    """
    stripped_tokens = (re.sub(r'[^\w\s]', '', token) for token in words)
    return [token for token in stripped_tokens if token != '']

def replace_numbers(words):
    """Replace all-digit tokens with their textual representation.

    Tokens for which ``str.isdigit()`` is true are converted with
    ``inflect``'s ``number_to_words``; every other token is passed through
    unchanged.
    """
    engine = inflect.engine()
    return [
        engine.number_to_words(token) if token.isdigit() else token
        for token in words
    ]

def remove_stopwords(words):
    """Remove English stop words from a list of tokenized words.

    Args:
        words: Iterable of tokens. Matching is exact, so tokens are expected
            to have been lower-cased already — TODO confirm with callers.

    Returns:
        A new list with the tokens not found in NLTK's English stop-word
        list, in their original order.
    """
    # Build the stop-word collection once per call. The earlier loop
    # evaluated stopwords.words('english') again for every token and tested
    # membership against the returned list.
    english_stopwords = set(stopwords.words('english'))
    return [word for word in words if word not in english_stopwords]

def stem_words(words):
    """Return the Lancaster stem of every token in ``words``."""
    lancaster = LancasterStemmer()
    return [lancaster.stem(token) for token in words]

def lemmatize_verbs(words):
    """Lemmatize every token in ``words`` as a verb (``pos='v'``) using WordNet."""
    wordnet_lemmatizer = WordNetLemmatizer()
    return [wordnet_lemmatizer.lemmatize(token, pos='v') for token in words]

def normalize(words):
    """Run the token-level cleaning steps over ``words`` and return the result.

    Steps, applied in this order: strip non-ASCII characters, lower-case,
    remove punctuation, spell out numbers, remove stop words.
    """
    cleaning_steps = (
        remove_non_ascii,
        to_lowercase,
        remove_punctuation,
        replace_numbers,
        remove_stopwords,
    )
    tokens = words
    for step in cleaning_steps:
        tokens = step(tokens)
    return tokens


def stem_and_lemmatize(words):
    """Return ``(stems, lemmas)``, each computed from the same ``words``."""
    return stem_words(words), lemmatize_verbs(words)

In [4]:
## Download the WordNet corpus (used by WordNetLemmatizer) if it is not present ##
# NOTE(review): the cleaning code also calls nltk.word_tokenize and
# stopwords.words('english'); presumably those need their own NLTK data
# packages on a fresh machine — confirm they are installed.
import nltk
nltk.download('wordnet')

[nltk_data] Downloading package wordnet to /Users/gaurav/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


True

In [5]:
def dataprep_full(row):
    """Clean one row's ``description`` and return it as a single string.

    Pipeline: expand contractions, drop square-bracketed spans, tokenize,
    run ``normalize`` (non-ASCII removal, lower-casing, punctuation removal,
    number spelling, stop-word removal), Lancaster-stem, then verb-lemmatize.

    Args:
        row: A mapping-like row (e.g. a DataFrame row passed by
            ``DataFrame.apply(..., axis=1)``) with a ``'description'`` entry.

    Returns:
        The processed tokens joined by single spaces, or the placeholder
        string ``'NA'`` when any step raises (for example when the
        description is missing or is not a string).
    """
    try:
        text = replace_contractions(row['description'])
        text = remove_between_square_brackets(text)
        tokens = nltk.word_tokenize(text)
        # normalize() applies the same five cleaning steps, in the same
        # order, that were previously spelled out one by one here.
        tokens = normalize(tokens)
        # NOTE(review): the lemmatizer receives already-stemmed tokens;
        # confirm that stemming before lemmatizing is intended.
        tokens = stem_words(tokens)
        tokens = lemmatize_verbs(tokens)
        return ' '.join(tokens)
    except Exception:
        # Best-effort: unprocessable rows become 'NA'. Catching Exception
        # rather than using a bare except lets KeyboardInterrupt and
        # SystemExit propagate, so a long apply() can still be interrupted.
        return 'NA'


In [6]:
## Importing TF-IDF vectorization libraries ##

from sklearn.feature_extraction.text import CountVectorizer
from sklearn.metrics.pairwise import cosine_similarity
from sklearn.feature_extraction.text import TfidfVectorizer

In [7]:
## Getting values for source app ##
# Query app: every document whose app_name is 'Flipkart', fetched with the
# same projection as df_apps (description, category, app_name; no _id).

test_df = pd.DataFrame(list(collection.find({'app_name': 'Flipkart'},{'_id':0, 'description': 1, 'category': 1, 'app_name': 1})))

In [8]:
### Performing cleaning and data preparation on target and train dataframe ###

# Drop catalogue rows without a description (mutates df_apps in place).
# test_df is not filtered here.
df_apps.dropna(subset=['description'], inplace = True)
# Cleaned text, one string per row, produced by dataprep_full.
test_set = test_df.apply(dataprep_full, axis = 1)
train_set = df_apps.apply(dataprep_full, axis = 1)


# Raw, uncleaned descriptions, kept for the comparison run on raw text.
test_set_2 = test_df['description']
train_set_2 = df_apps['description']
# The query rows are concatenated first, so position 0 of every combined
# series is the first row of test_df. The original indexes are kept, so
# index labels may repeat across the two parts.
combined_df_series_2 = pd.concat([test_set_2, train_set_2])
combined_df_series = pd.concat([test_set, train_set])
combined_app_names = pd.concat([test_df['app_name'], df_apps['app_name']])


# Sanity check: print the three lengths so they can be compared.
print(len(combined_app_names))
print(len(combined_df_series))
print(len(combined_df_series_2))


979
980
980
980


In [9]:
# Create histogram of the description lengths (number of characters) #
# Adds a "Length" column to df_apps in place.
df_apps["Length"]= df_apps["description"].str.len() 
ax = df_apps['Length'].plot.hist(bins=5, alpha=0.5)
ax

<matplotlib.axes._subplots.AxesSubplot at 0x126910668>

In [10]:
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix_train = tfidf_vectorizer.fit_transform(combined_df_series)  # TF-IDF matrix of the cleaned descriptions (vectorizer defaults)
result = cosine_similarity(tfidf_matrix_train[0:1], tfidf_matrix_train)  # row 0 (the first query row) is compared against every row, itself included

In [11]:
## Create histogram of lengths to show more analysis for documents
## Pass in only value as name and return name with scores

## Find ways to do this for just one message, instead of all the messages all the time
## 

In [12]:
# Rank apps by similarity to the query row. Despite its name, the
# 'cosine_dist' column holds cosine *similarity* values (output of
# cosine_similarity), so higher means more similar and the query row itself
# scores 1.0.
result_df = pd.DataFrame({'app_name': combined_app_names, 'cosine_dist': result[0]}).sort_values(['cosine_dist'], ascending=False)
result_df.head(10)

Unnamed: 0,app_name,cosine_dist
0,Flipkart,1.0
430,Flipkart,1.0
431,Snapdeal,0.364299
434,HomeShop18,0.351901
447,DHgate Mobile online wholesale,0.297361
449,DHgate Mobile,0.297361
438,Wanelo Shopping,0.280756
429,eBay,0.273683
464,Amazon,0.269543
443,AliExpress Super Deals,0.260423


In [13]:
### Direct approach: TF-IDF on the raw descriptions, without the text-cleaning step ###
# Note: this re-binds tfidf_vectorizer, tfidf_matrix_train and result, which
# were already assigned when the cleaned text was vectorized.
tfidf_vectorizer = TfidfVectorizer()
tfidf_matrix_train = tfidf_vectorizer.fit_transform(combined_df_series_2)  # TF-IDF matrix of the raw descriptions (vectorizer defaults)
result = cosine_similarity(tfidf_matrix_train[0:1], tfidf_matrix_train)  # row 0 (the first query row) is compared against every row, itself included

# As above, 'cosine_dist' actually holds cosine similarity (higher = more similar).
result_df_2 = pd.DataFrame({'app_name': combined_app_names, 'cosine_dist': result[0]}).sort_values(['cosine_dist'], ascending=False)
result_df_2.head(10)

Unnamed: 0,app_name,cosine_dist
0,Flipkart,1.0
430,Flipkart,1.0
431,Snapdeal,0.337076
434,HomeShop18,0.31496
464,Amazon,0.281992
447,DHgate Mobile online wholesale,0.268639
449,DHgate Mobile,0.268639
438,Wanelo Shopping,0.265911
433,Jabong,0.256929
429,eBay,0.229808
