# Extracting value from the payment notes

In [41]:
import pandas as pd
import numpy as np
import psycopg2
import pymongo
import json
import datetime
import pickle
import functions as fn
import matplotlib.pyplot as plt 
%matplotlib inline
import emoji
import regex
import nltk
from nltk import FreqDist
from nltk import word_tokenize
from nltk.corpus import stopwords
import string
from emoji.unicode_codes import UNICODE_EMOJI as ue

In [2]:
# Load the pickled object stored in initial_5pct_transactions.pkl
# (presumably the 5% transaction sample prepared in an earlier step — confirm).
# NOTE(review): `initial_5pct` is not referenced by any of the visible cells below, and
# pickle.load can execute arbitrary code — only load pickle files you produced yourself.
with open('initial_5pct_transactions.pkl', 'rb') as f:
    initial_5pct = pickle.load(f)

In [10]:
# Retrieve information to connect to the database
# NOTE(review): the absolute, user-specific path below only exists on the author's machine;
# consider reading the location from an environment variable or a config cell.
keys = fn.get_keys("/Users/jjherranzsarrion/.secret/local_info.json")
username = keys['username']
password = keys['password']

# Select your time ranges and add them in variables
# Only train_window_end is used by the query in the visible cells; the other three
# timestamps are presumably consumed elsewhere — confirm or remove.
train_window_end = '2018-07-28 23:59:59'
test_window_start = '2018-07-29 00:00:00'
test_window_end = '2018-07-29 23:59:59'
previous_day_start = '2018-07-28 00:00:00'

In [11]:
# Pull every payment created up to the end of the training window into a DataFrame.
cursor = fn.extracting_cursor(username, password)
# train_window_end is a constant defined in this notebook, so interpolating it is acceptable
# here; switch to a bound query parameter if the value ever comes from outside the notebook.
q = f"""SELECT *
        FROM payments p
        WHERE p.date_created <= CAST('{train_window_end}' AS timestamp);"""
cursor.execute(q)
# Name the columns at construction time: assigning `.columns` after building the frame
# raises a length-mismatch error whenever the query returns no rows.
column_names = [x[0] for x in cursor.description]
payments = pd.DataFrame(cursor.fetchall(), columns=column_names)

In [12]:
# Keep a handle on the free-text note column; the split_count calls below read from it.
notes = payments['note']

In [13]:
payments['note'].value_counts()

Food                                 2877
🏠💸                                   2204
🍕                                    2185
Uber                                 1951
🍺                                    1355
                                     ... 
💙🌵                                      1
I just broke your glasses.. sorry       1
🐦🏡                                      1
ZZB                                     1
Insurance & phone                       1
Name: note, Length: 135985, dtype: int64

In [503]:
def split_count(notes):
    """Escape every emoji in ``notes`` so the text survives ASCII-only tokenisation.

    Each note is split into grapheme clusters. A cluster containing at least one
    character known to ``emoji.UNICODE_EMOJI`` is replaced by its ``unicode-escape``
    form followed by a space; every other cluster is copied through unchanged.

    Parameters
    ----------
    notes : iterable of str
        The raw payment notes.

    Returns
    -------
    recomposed_note : list of str
        One string per input note, with emojis replaced by their escape sequence.
    emoji_dict : dict
        Maps each emoji cluster that was seen to the lower-cased version of its
        escape sequence (the form the tokens have after later lower-casing).
    """
    emoji_dict = {}
    recomposed_note = []
    for note in notes:
        note_text = []
        # \X matches one extended grapheme cluster, so multi-codepoint emojis stay together.
        for cluster in regex.findall(r'\X', note):
            if any(char in emoji.UNICODE_EMOJI for char in cluster):
                escaped = cluster.encode('unicode-escape').decode('ASCII')
                emoji_dict[cluster] = escaped.lower()
                # The trailing space keeps consecutive emojis apart for the tokenizer.
                note_text.append(escaped + ' ')
            else:
                note_text.append(cluster)
        recomposed_note.append(''.join(note_text))
    # Return the mapping built above; the previous code returned the undefined name
    # `emoji_unicode`, which only resolved when a stale global of that name existed.
    return recomposed_note, emoji_dict

In [504]:
# Try the function on the first five notes only, keeping both the escaped notes and the
# emoji -> escape mapping for the cells below.
recomposed_note, emoji_unicode = split_count(notes[:5])

In [505]:
emoji_unicode

{'👕': '\\u0001f455', '📱': '\\u0001f4f1', '💸': '\\u0001f4b8'}

In [506]:
recomposed_note

['for utilities',
 '\\U0001f455 !',
 'Thank you!',
 '\\U0001f4f1 \\U0001f4b8 ',
 'Mt Dew & candy']

In [507]:
# Token pattern: a run of ASCII letters, digits or backslashes (so escaped emojis such as
# \U0001f455 stay in one piece), optionally followed by an apostrophe suffix like 's.
# Written as a raw string: the previous non-raw literal relied on the invalid escape "\]",
# which newer Python versions warn about. The resulting pattern text is unchanged.
pattern = r"([a-zA-Z0-9\\]+(?:'[a-z]+)?)"
recomposed_note_raw = [nltk.regexp_tokenize(note, pattern) for note in recomposed_note]

In [508]:
recomposed_note_raw

[['for', 'utilities'],
 ['\\U0001f455'],
 ['Thank', 'you'],
 ['\\U0001f4f1', '\\U0001f4b8'],
 ['Mt', 'Dew', 'candy']]

In [509]:
# Build the stop-word list: NLTK's English stop words, ASCII punctuation and a hand-curated
# list of slang / noise tokens.
stopwords_list = stopwords.words('english')
stopwords_list += list(string.punctuation)
# additional slang and informal versions of the original words had to be added to the corpus.
stopwords_list += (["im", "ur", "u", "'s", "n", "z", "n't", "brewskies", "mcd’s", "Ty$",
                    "Diploooooo", "thx", "Clothessss", "K2", "B", "Comida", "yo", "jobby",
                    "F", "jus", "bc", "queso", "fil", "Lol", "EZ", "RF", "기프트카드", "감사합니다",
                    "Bts", "youuuu", "X’s", "bday", "WF", "Fooooood", "Yeeeeehaw", "temp",
                    "af", "Chipoodle", "Hhuhhyhy", "Yummmmers", "MGE", "O", "Coook", "wahoooo",
                    "Cuz", "y", "Cutz", "Lax", "LisBnB", "vamanos", "vroom", "Para", "el", "8==",
                    "bitchhh", "¯\\_(ツ)_/¯", "Ily", "CURRYYYYYYY", "Depósito", "Yup", "Shhhhh"])

# Match case-insensitively: tokens are lower-cased on output, so the membership test must
# use the lower-cased form too, otherwise capitalised stop words ("The", "For") slip through.
# A set also makes each lookup constant-time instead of scanning the list.
stopwords_lookup = {stop_word.lower() for stop_word in stopwords_list}

recomposed_note_stopped = [
    [w.lower() for w in note if w.lower() not in stopwords_lookup]
    for note in recomposed_note_raw
]

In [510]:
recomposed_note_stopped

[['utilities'],
 ['\\u0001f455'],
 ['thank'],
 ['\\u0001f4f1', '\\u0001f4b8'],
 ['mt', 'dew', 'candy']]

In [519]:
# Turn the escaped emoji tokens back into the actual emoji characters.
# Invert the emoji -> escape mapping once so each token is a single dict lookup instead of
# a scan over every recorded emoji.
escape_to_emoji = {escaped: symbol for symbol, escaped in emoji_unicode.items()}

recomposed_note_stopped_em = []
for note in recomposed_note_stopped:
    note_list = []
    for word in note:
        if word.startswith('\\'):
            # A backslash token that matches no recorded emoji is dropped, as before.
            if word in escape_to_emoji:
                note_list.append(escape_to_emoji[word])
        else:
            note_list.append(word)
    recomposed_note_stopped_em.append(note_list)

In [520]:
recomposed_note_stopped_em

[['utilities'], ['👕'], ['thank'], ['📱', '💸'], ['mt', 'dew', 'candy']]

In [528]:
def split_count(notes_list):
    """Replace the emojis inside tokenised notes with their ``:alias:`` text.

    ``notes_list`` is a list of notes, each note being a list of string tokens.
    The result has the same nesting: every token is rebuilt with each emoji
    grapheme cluster swapped for the output of ``emoji.demojize``; all other
    text is copied through unchanged.

    NOTE: this definition reuses the name of the earlier ``split_count`` in this
    notebook and therefore replaces it once this cell has run.
    """
    def _demojize_token(token):
        # Work cluster by cluster so multi-codepoint emojis are converted as a unit.
        pieces = [
            emoji.demojize(cluster)
            if any(char in emoji.UNICODE_EMOJI for char in cluster)
            else cluster
            for cluster in regex.findall(r'\X', token)
        ]
        return ''.join(pieces)

    return [[_demojize_token(token) for token in tokens] for tokens in notes_list]

In [529]:
# Convert the restored emojis in the stop-word-filtered token lists into their :alias: text form.
fully_recomposed_notes = split_count(recomposed_note_stopped_em)

In [530]:
fully_recomposed_notes

[['utilities'],
 [':t-shirt:'],
 ['thank'],
 [':mobile_phone:', ':money_with_wings:'],
 ['mt', 'dew', 'candy']]

In [None]:
# split_count (as last defined) expects a list of token lists and returns the same nesting.
# Passing the Series directly makes it walk every note character by character and store a
# list of characters per row, which the word_tokenize cells below cannot consume.
# Wrap the notes as a single "token list" and unwrap the single result to get one
# demojized string per payment. Missing notes become empty strings because
# regex.findall only accepts strings.
payments['recomposed_note'] = split_count([notes.fillna('')])[0]

In [None]:
# Preview only the first rows: displaying the whole frame adds little and may expose raw
# payment records in the saved notebook output.
payments.head()

In [None]:
from nltk.stem.wordnet import WordNetLemmatizer

In [None]:
# Lemmatize with POS Tag
from nltk.corpus import wordnet

def get_wordnet_pos(word):
    """Return the WordNet part-of-speech constant to pass to ``lemmatize()``.

    The word is tagged on its own with ``nltk.pos_tag`` and the first letter of
    the resulting tag selects the constant: J -> adjective, N -> noun,
    V -> verb, R -> adverb. Any other tag falls back to noun.
    """
    pos_tag = nltk.pos_tag([word])[0][1]
    initial = pos_tag[0].upper()

    if initial == "J":
        return wordnet.ADJ
    if initial == "N":
        return wordnet.NOUN
    if initial == "V":
        return wordnet.VERB
    if initial == "R":
        return wordnet.ADV
    return wordnet.NOUN

In [None]:
# Init Lemmatizer: one WordNetLemmatizer instance, reused by the lemmatisation cells below.
lemmatizer = WordNetLemmatizer()

In [None]:
# Fetch the NLTK data packages; presumably these back nltk.pos_tag, the WordNet
# lemmatizer and nltk.word_tokenize used in the surrounding cells — confirm against the
# installed NLTK version.
# NOTE(review): this cell has to run before the lemmatisation cells; consider moving it
# next to the imports at the top of the notebook.
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('punkt')

In [None]:
# Sanity check: lemmatise the tokens of the first recomposed note, using the POS of each
# token to pick the right lemma.
sentence = payments['recomposed_note'][0]
sentence_tokens = nltk.word_tokenize(sentence)
print([lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in sentence_tokens])

In [None]:
# Lemmatise every recomposed note: tokenise it, then lemmatise each token with its POS tag.
lemmatized_notes = []
for sentence in payments['recomposed_note']:
    tokens = nltk.word_tokenize(sentence)
    lemmas = [lemmatizer.lemmatize(token, get_wordnet_pos(token)) for token in tokens]
    lemmatized_notes.append(lemmas)

In [None]:
lemmatized_notes[-1]

In [None]:
# Count how often each escaped note string occurs and show the ten most frequent ones.
# NOTE(review): recomposed_note holds whole notes from the five-note sample produced above,
# not individual tokens — confirm this is the intended input rather than the tokenised lists.
recomposed_note_freqdist = FreqDist(recomposed_note)
recomposed_note_freqdist.most_common(10)

In [None]:
recomposed_note_stopped

In [None]:
payments['note'].value_counts()