# Extracting value from the payment notes

In [1]:
import pandas as pd
import numpy as np
import psycopg2
import pymongo
import json
import datetime
import pickle
import functions as fn
import matplotlib.pyplot as plt 
%matplotlib inline
import emoji
import regex
import nltk
from nltk import FreqDist
from nltk import word_tokenize
from nltk.corpus import stopwords

In [2]:
# load the above mentioned pickle
with open('initial_5pct_transactions.pkl', 'rb') as f:
    initial_5pct = pickle.load(f)

In [3]:
# Retrieve information to connect to the database
keys = fn.get_keys("/Users/jjherranzsarrion/.secret/local_info.json")
username = keys['username']
password = keys['password']

# Select your time ranges and add them in variables
train_window_end = '2018-07-28 23:59:59'
test_window_start = '2018-07-29 00:00:00'
test_window_end = '2018-07-29 23:59:59'
previous_day_start = '2018-07-28 00:00:00'

In [4]:
cursor = fn.extracting_cursor(username, password)
q = f"""SELECT *
        FROM payments p;"""
cursor.execute(q)
payments = pd.DataFrame(cursor.fetchall())
payments.columns = [x[0] for x in cursor.description]

In [6]:
notes = payments['note']

In [7]:
payments['note'].value_counts()

🏠💸                                                         22188
Food                                                       18393
🍕                                                          13313
Uber                                                       11323
⛽                                                           8981
                                                           ...  
A nice 5 dollars.                                              1
My drunk payment plus request for the shot...holy wow♥️        1
f饭                                                             1
Bike Tour                                                      1
Axlllllll!                                                     1
Name: note, Length: 768892, dtype: int64

In [8]:
def split_count(notes):
    """Function that takes in all the notes and returns the emojis used
    in the form of text captured by :colons:"""
    recomposed_note = []
    for note in notes:
        note_text = []
        data = regex.findall(r'\X', note)
        for word in data:
            if any(char in emoji.UNICODE_EMOJI for char in word):
                note_text.append(emoji.demojize(f'{word}'))
            else:
                note_text.append(word)
        recomposed_note.append(''.join(note_text))
    return recomposed_note

In [9]:
recomposed_note = split_count(notes)

In [12]:
recomposed_note

['for utilities',
 ':t-shirt:!',
 'Thank you!',
 ':mobile_phone::money_with_wings:',
 'Mt Dew & candy',
 'Pho',
 ':chocolate_bar::soft_ice_cream::glass_of_milk:',
 'Lashes :face_blowing_a_kiss:',
 ':cocktail_glass:',
 ':wrapped_gift:',
 'Stuff',
 'Phils',
 ':woman_dancing_light_skin_tone::woman_dancing_light_skin_tone::woman_dancing_light_skin_tone::woman_dancing_light_skin_tone:',
 'Eye cream',
 ':honeybee::ear:',
 'I cant read good',
 ':baby:',
 'From Grandaddy',
 'Don’t let me spend this',
 '7/23 :smiling_face_with_smiling_eyes:',
 'I cant read good',
 'National grid',
 'A romantic evening in a cabin :red_heart::red_heart::red_heart:',
 'Cooper and thanks!!!',
 'Personal UPS',
 'Tree Branch Cider House :clinking_beer_mugs:',
 'Your welcome.',
 'Pens',
 'as many brewskies as this can buy. happy birthday!',
 'Beach day',
 'Buy me bagels',
 'Different Scoop!',
 'Food',
 ':hot_beverage::hamburger: Chinatown',
 'Trash',
 ':automobile:',
 '@Melissa-Spencer-23 payment',
 ':venmo_dollar:',


In [13]:
payments['recomposed_note'] = [note for note in split_count(notes)]

In [14]:
payments

Unnamed: 0,index,note,action,status,date_created,audience,date_completed,target_type,target_user_id,actor_id,payment_id,recomposed_note
0,0,for utilities,pay,settled,2018-07-26 18:48:10,public,2018-07-26T18:48:10,user,1572642482028544167,2206066431492096327,2532209455660008361,for utilities
1,1,👕!,pay,settled,2018-07-26 18:48:08,public,2018-07-26T18:48:08,user,2242966299082752545,2200417693859840681,2532209439595823434,:t-shirt:!
2,2,Thank you!,pay,settled,2018-07-26 18:48:08,public,2018-07-26T18:48:08,user,1984520039432192983,2373608382922752189,2532209440686343010,Thank you!
3,3,📱💸,pay,settled,2018-07-26 18:48:08,public,2018-07-26T18:48:08,user,1780528822878208201,1670504276557824171,2532209443756573591,:mobile_phone::money_with_wings:
4,4,Mt Dew & candy,pay,settled,2018-07-26 18:48:08,public,2018-07-26T18:48:09,user,2496761604079616999,1957419122950144676,2532209445073584640,Mt Dew & candy
...,...,...,...,...,...,...,...,...,...,...,...,...
1769141,1769141,For expenses on splitwise.com,pay,settled,2018-10-02 13:46:28,public,2018-10-02T13:46:28,user,1761973221982208918,1944598771400704529,2581342356175323552,For expenses on splitwise.com
1769142,1769142,🛋,charge,settled,2018-10-02 13:46:28,public,2018-10-02T13:46:28,user,1995010262171648016,2185557107015680994,2581342357517501181,:couch_and_lamp:
1769143,1769143,Game Pass!!!,pay,settled,2018-10-02 13:46:28,public,2018-10-02T13:46:29,user,1880045110951936595,2372803185606656831,2581342361015550468,Game Pass!!!
1769144,1769144,4-5 🌭,pay,settled,2018-10-02 13:46:28,public,2018-10-02T13:46:29,user,1651668814921728056,2200260675895296778,2581342361242042754,4-5 :hot_dog:


In [15]:
from nltk.stem.wordnet import WordNetLemmatizer

In [17]:
# Lemmatize with POS Tag
from nltk.corpus import wordnet

def get_wordnet_pos(word):
    """Map POS tag to first character lemmatize() accepts"""
    tag = nltk.pos_tag([word])[0][1][0].upper()
    tag_dict = {"J": wordnet.ADJ,
                "N": wordnet.NOUN,
                "V": wordnet.VERB,
                "R": wordnet.ADV}

    return tag_dict.get(tag, wordnet.NOUN)

In [18]:
# Init Lemmatizer
lemmatizer = WordNetLemmatizer()

In [26]:
nltk.download('averaged_perceptron_tagger')
nltk.download('wordnet')
nltk.download('punkt')

[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /Users/jjherranzsarrion/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/jjherranzsarrion/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!
[nltk_data] Downloading package punkt to
[nltk_data]     /Users/jjherranzsarrion/nltk_data...
[nltk_data]   Unzipping tokenizers/punkt.zip.


True

In [32]:
sentence = payments['recomposed_note'][0]
print([lemmatizer.lemmatize(w, get_wordnet_pos(w)) for w in nltk.word_tokenize(sentence)])

['for', 'utility']


In [34]:
lemmatized_notes = []
for sentence in payments['recomposed_note']:
    lemmatized_notes.append([lemmatizer.lemmatize(word, get_wordnet_pos(word)) 
                             for word in nltk.word_tokenize(sentence)])

In [37]:
lemmatized_notes[-1]

[':', 'dog_face', ':']

In [16]:
recomposed_note_freqdist = FreqDist(recomposed_note)
recomposed_note_freqdist.most_common(10)

[(':house::money_with_wings:', 22188),
 ('Food', 18393),
 (':pizza:', 13313),
 ('Uber', 11323),
 (':fuel_pump:', 10860),
 (':house:', 8668),
 ('Rent', 8392),
 (':beer_mug:', 6644),
 (':fork_and_knife:', 6644),
 (':hot_beverage:', 5787)]

In [None]:
stopwords_list = stopwords.words('english')
stopwords_list += ["im", "ur", "u", "'s", "n", "z", "n't"]

recomposed_note_stopped = [note for note in recomposed_note if note not in stopwords_list]

In [None]:
recomposed_note_stopped

In [None]:
payments['note'].value_counts()