In [1]:
# to manipulate dataframes
import pandas as pd

# natural language processing: n-gram ranking
import re
import unicodedata
import nltk
nltk.download('stopwords')
nltk.download('wordnet')

# for natural language processing: named entity recognition
import spacy
from collections import Counter

# for natural language processing: sentiment analysis
from nltk.sentiment.vader import SentimentIntensityAnalyzer
from nltk.sentiment.util import *
from nltk import word_tokenize, sent_tokenize

# extra, corpus-specific words to ignore on top of NLTK's English stop words;
# clean() compares them against lower-cased tokens, so keep entries lowercase
ADDITIONAL_STOPWORDS = ['kiva']

[nltk_data] Downloading package stopwords to
[nltk_data]     /Users/ednalyndedios/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package wordnet to
[nltk_data]     /Users/ednalyndedios/nltk_data...
[nltk_data]   Package wordnet is already up-to-date!


In [2]:
# load the Kiva loans CSV; the free-text field analysed below is the `use` column
# (it is accessed by name, so its position in the file does not matter)
df = pd.read_csv('../data/input/kiva_loans.csv')

In [3]:
# preview the first rows to sanity-check the load and see the available columns
df.head()

Unnamed: 0,id,funded_amount,loan_amount,activity,sector,use,country_code,country,region,currency,partner_id,posted_time,disbursed_time,funded_time,term_in_months,lender_count,tags,borrower_genders,repayment_interval,date
0,653051,300.0,300.0,Fruits & Vegetables,Food,"To buy seasonal, fresh fruits to sell.",PK,Pakistan,Lahore,PKR,247.0,2014-01-01 06:12:39+00:00,2013-12-17 08:00:00+00:00,2014-01-02 10:06:32+00:00,12.0,12,,female,irregular,2014-01-01
1,653053,575.0,575.0,Rickshaw,Transportation,to repair and maintain the auto rickshaw used ...,PK,Pakistan,Lahore,PKR,247.0,2014-01-01 06:51:08+00:00,2013-12-17 08:00:00+00:00,2014-01-02 09:17:23+00:00,11.0,14,,"female, female",irregular,2014-01-01
2,653068,150.0,150.0,Transportation,Transportation,To repair their old cycle-van and buy another ...,IN,India,Maynaguri,INR,334.0,2014-01-01 09:58:07+00:00,2013-12-17 08:00:00+00:00,2014-01-01 16:01:36+00:00,43.0,6,"user_favorite, user_favorite",female,bullet,2014-01-01
3,653063,200.0,200.0,Embroidery,Arts,to purchase an embroidery machine and a variet...,PK,Pakistan,Lahore,PKR,247.0,2014-01-01 08:03:11+00:00,2013-12-24 08:00:00+00:00,2014-01-01 13:00:00+00:00,11.0,8,,female,irregular,2014-01-01
4,653084,400.0,400.0,Milk Sales,Food,to purchase one buffalo.,PK,Pakistan,Abdul Hakeem,PKR,245.0,2014-01-01 11:53:19+00:00,2013-12-17 08:00:00+00:00,2014-01-01 19:18:51+00:00,14.0,16,,female,monthly,2014-01-01


In [4]:
# keep only the rows whose `use` text is present (drops NaN/None entries)
df = df[df['use'].notna()]

In [5]:
# narrow the frame to the columns kept for the rest of the notebook
df = df.loc[:, [
    'id',
    'funded_amount',
    'loan_amount',
    'activity',
    'sector',
    'use',
    'country_code',
    'borrower_genders',
    'date',
]]

In [6]:
# (rows, columns) after dropping rows without `use` text and narrowing the columns
df.shape

(666973, 9)

In [7]:
# Work on a random subset of at most 5000 rows; the fixed random_state makes the
# draw reproducible. min() guards against inputs with fewer than 5000 rows, for
# which DataFrame.sample (without replacement) would raise a ValueError.
df = df.sample(n=min(5000, len(df)), random_state=493).reset_index(drop=True)

In [8]:
# confirm the size of the sampled frame
df.shape

(5000, 9)

In [9]:
def clean(text):
    """
    Normalize a raw string and return its lemmatized, stop-word-free tokens.

    Steps, in order:
      1. Unicode NFKD normalization followed by an ASCII encode with errors
         ignored, which drops every character that has no ASCII form.
      2. Lower-casing.
      3. Deletion of every character that is neither a word character nor
         whitespace. Characters are deleted, not replaced by a space, so a
         hyphenated word such as "cycle-van" becomes the single token
         "cyclevan".
      4. Splitting on whitespace.
      5. Removal of NLTK's English stop words plus ADDITIONAL_STOPWORDS.
      6. Lemmatization of the remaining tokens with WordNetLemmatizer
         (called without a part-of-speech argument).

    Parameters
    ----------
    text : str
        The raw text to clean.

    Returns
    -------
    list of str
        The cleaned tokens, in their original order.
    """
    wnl = nltk.stem.WordNetLemmatizer()
    # A set gives constant-time membership tests; the stop words are checked
    # once per token, so a list here costs a linear scan for every word.
    stopwords = set(nltk.corpus.stopwords.words('english')) | set(ADDITIONAL_STOPWORDS)
    text = (unicodedata.normalize('NFKD', text)
        .encode('ascii', 'ignore')
        .decode('utf-8', 'ignore')
        .lower())
    words = re.sub(r'[^\w\s]', '', text).split()
    return [wnl.lemmatize(word) for word in words if word not in stopwords]

def get_bigrams(content, top_n=10):
    """
    Count adjacent word pairs and return the most frequent ones.

    Parameters
    ----------
    content : list of str
        Tokens in reading order.
    top_n : int, default 10
        Maximum number of bigrams to return.

    Returns
    -------
    pandas.DataFrame
        Columns 'bigram' (a 2-tuple of tokens) and 'count', ordered from the
        most to the least frequent, with at most `top_n` rows.
    """
    counts = pd.Series(nltk.ngrams(content, 2)).value_counts()
    # head() is an explicit positional cut; value_counts() is already sorted
    # in descending order of frequency.
    bigrams = counts.head(top_n).to_frame().reset_index()
    bigrams.columns = ['bigram', 'count']
    return bigrams

def get_trigrams(content, top_n=10):
    """
    Count runs of three adjacent words and return the most frequent ones.

    Parameters
    ----------
    content : list of str
        Tokens in reading order.
    top_n : int, default 10
        Maximum number of trigrams to return.

    Returns
    -------
    pandas.DataFrame
        Columns 'trigram' (a 3-tuple of tokens) and 'count', ordered from the
        most to the least frequent, with at most `top_n` rows.
    """
    counts = pd.Series(nltk.ngrams(content, 3)).value_counts()
    # head() is an explicit positional cut; value_counts() is already sorted
    # in descending order of frequency.
    trigrams = counts.head(top_n).to_frame().reset_index()
    trigrams.columns = ['trigram', 'count']
    return trigrams

In [10]:
# converts to a list of clean tokens
# Join the raw strings with a space rather than cleaning str() of the whole list:
# the list repr turns control characters into escape sequences (a newline becomes
# the two characters "\" and "n"), and once clean() strips the backslash the
# leftover letter glues the neighbouring words together.
# NOTE(review): all texts are cleaned as one token stream, so n-grams computed on
# `content` can span the boundary between two consecutive loan descriptions.
content = clean(' '.join(df.use.astype(str)))

# N-Gram Ranking

In [11]:
# TODO: push to S3
# exports the top bigrams (columns: bigram, count) to a flat CSV file;
# index=False keeps the DataFrame's row index out of the file
get_bigrams(content).to_csv('../data/output/output_data_ngram_bigrams.csv', index=False)

In [12]:
# TODO: push to S3
# exports the top trigrams (columns: trigram, count) to a flat CSV file;
# index=False keeps the DataFrame's row index out of the file
get_trigrams(content).to_csv('../data/output/output_data_ngram_trigrams.csv', index=False)

# Named-Entity Recognition

In [13]:
def get_entities(entities, ent_type, top_n=20):
    """
    Tally the entities of one type and return the most frequent ones.

    Parameters
    ----------
    entities : iterable
        Objects exposing a `label_` attribute (the entity type) and a `text`
        attribute (the entity's surface string).
    ent_type : str
        The label to keep; entities with any other `label_` are ignored.
    top_n : int, default 20
        Maximum number of distinct entity texts to return.

    Returns
    -------
    pandas.DataFrame
        Columns 'entity' and 'count', ordered from the most to the least
        frequent. When no entity matches `ent_type` the frame is empty but
        still carries both columns.
    """
    matching_texts = [entity.text for entity in entities if entity.label_ == ent_type]
    # Pass the column names to the constructor: with no matches the frame would
    # otherwise have zero columns and assigning two names to it raises ValueError.
    return pd.DataFrame(
        Counter(matching_texts).most_common(top_n),
        columns=['entity', 'count'],
    )

In [14]:
# Requires the en_core_web_sm model package to be installed in the kernel's
# environment (e.g. `python -m spacy download en_core_web_sm`); without it
# spacy.load raises OSError [E050].
# The tagger, parser and textcat pipeline components are disabled at load time.
nlp = spacy.load("en_core_web_sm", disable=["tagger", "parser", "textcat"])

OSError: [E050] Can't find model 'en_core_web_sm'. It doesn't seem to be a shortcut link, a Python package or a valid path to a data directory.