In [1]:
import numpy as np
import pandas as pd

## Punto 1

In [2]:
# before normalizing the loan_lenders.csv table, let's check that each username
# doesn't contain spaces or commas
data_folder_path = 'additional-kiva-snapshot/'
lenders = pd.read_csv(data_folder_path + 'lenders.csv')
pd.set_option('display.max_rows', 10)
lenders

Unnamed: 0,permanent_name,display_name,city,state,country_code,member_since,occupation,loan_because,loan_purchase_num,invited_by,num_invited
0,qian3013,Qian,,,,1461300457,,,1.0,,0
1,reena6733,Reena,,,,1461300634,,,9.0,,0
2,mai5982,Mai,,,,1461300853,,,,,0
3,andrew86079135,Andrew,,,,1461301091,,,5.0,Peter Tan,0
4,nguyen6962,Nguyen,,,,1461301154,,,,,0
...,...,...,...,...,...,...,...,...,...,...,...
2349169,janet7309,Janet,,,,1342097163,,,,,0
2349170,pj4198,,,,,1342097515,,,,,0
2349171,maria2141,Maria,,,US,1342099723,,,2.0,,0
2349172,simone9846,Simone,,,,1342100213,,,,,0


In [3]:
space = lenders['permanent_name'].str.find(' ') == -1
comma = lenders['permanent_name'].str.find(',') == -1
total = space & comma
del space, comma
assert np.any(total), 'Invalid account name'
del total

In [4]:
loan_lenders = pd.read_csv(data_folder_path + 'loans_lenders.csv')
loan_lenders

Unnamed: 0,loan_id,lenders
0,483693,"muc888, sam4326, camaran3922, lachheb1865, reb..."
1,483738,"muc888, nora3555, williammanashi, barbara5610,..."
2,485000,"muc888, terrystl, richardandsusan8352, sherri4..."
3,486087,"muc888, james5068, rudi5955, daniel9859, don92..."
4,534428,"muc888, niki3008, teresa9174, mike4896, david7..."
...,...,...
1387427,678999,"michael43411218, carol5987, gooddogg1, chris41..."
1387428,1207353,"rjhoward1986, jeffrey6870, trolltech4460, elys..."
1387429,1206220,"vicky7746, gooddogg1, fairspirit, craig9729960..."
1387430,1206425,"rich6705, sergiiy9766, angela7509, barbara5610..."


In [5]:
lenders_list = loan_lenders.lenders.str.split(', ')
lengths = [len(l) for l in lenders_list.values]
#np.hstack(prova[0:10])
lengths
norm_loan_lenders = pd.DataFrame({'loan_id': np.repeat(loan_lenders['loan_id'].values, lengths),
                       'lender': np.concatenate(lenders_list)})
norm_loan_lenders

Unnamed: 0,loan_id,lender
0,483693,muc888
1,483693,sam4326
2,483693,camaran3922
3,483693,lachheb1865
4,483693,rebecca3499
...,...,...
28293926,1206425,trogdorfamily7622
28293927,1206425,danny6470
28293928,1206425,don6118
28293929,1206486,alan5175


# Secondo Punto

In [6]:
loans = pd.read_csv(data_folder_path + 'loans.csv', nrows = 100)

Le due colonne di interesse sono inizialmente codificate come stringhe di testo, è necessario convertirle in un formato adatto a dati temporali. Eventuali valori nulli o non validi, sono automaticamente gestiti da Pandas e diventano <pandas._libs.tslibs.nattype.NaTType>. Per esempio:

In [7]:
print('Esempio di dato mancante:', loans['disburse_time'][64])
loans['disburse_time'] = pd.to_datetime(loans['disburse_time'])
loans['planned_expiration_time'] = pd.to_datetime(loans['planned_expiration_time'])
print('... convertito in NaT', loans['disburse_time'][64])

Esempio di dato mancante: 2010-11-18 08:00:00.000 +0000
... convertito in NaT 2010-11-18 08:00:00+00:00


I NaT si propagano come i nan per i float, per cui qualsiasi operazione algebrica su di essi restisce NaT. Calcolando la differenza sottostante quindi le entrate NaT rimarranno tali

In [9]:
print('Propagazione dei NaT:', (loans['disburse_time'] - loans['planned_expiration_time'])[64])

Propagazione dei NaT: NaT


In [10]:
loans['duration'] = loans['disburse_time'] - loans['planned_expiration_time']

In [11]:
loans[['planned_expiration_time', 'disburse_time', 'duration']]

Unnamed: 0,planned_expiration_time,disburse_time,duration
0,2014-02-14 03:30:06+00:00,2013-12-22 08:00:00+00:00,-54 days +04:29:54
1,2014-03-26 22:25:07+00:00,2013-12-20 08:00:00+00:00,-97 days +09:34:53
2,2014-02-15 21:10:05+00:00,2014-01-09 08:00:00+00:00,-38 days +10:49:55
3,2014-02-21 03:10:02+00:00,2014-01-17 08:00:00+00:00,-35 days +04:49:58
4,2014-02-13 06:10:02+00:00,2013-12-17 08:00:00+00:00,-58 days +01:49:58
...,...,...,...
95,2015-11-11 20:50:02+00:00,2015-09-10 07:00:00+00:00,-63 days +10:09:58
96,2015-11-12 14:30:11+00:00,2015-12-07 08:00:00+00:00,24 days 17:29:49
97,2015-11-12 21:10:06+00:00,2015-10-02 07:00:00+00:00,-42 days +09:49:54
98,2015-11-13 12:40:03+00:00,2015-10-01 07:00:00+00:00,-44 days +18:19:57


# Terzo Punto

Quello che segue è il metodo più rapido per ottenere quali utenti abbiano presato denaro più di una volta. Alternativamente, si possono contare le occorrenze di ciascun nome nella tabella norm_loan_lenders, per poi tenere solo quelli con conteggio > 1. Tuttavia, se il database è coerente i due metodi dovrebbero identificare gli stessi utenti. Il primo metodo è più rapido da scrivere ed eseguire, e permette di restituire anche altri dati dei soggetti senza ulteriori query.

In [122]:
lenders[lenders['loan_purchase_num'] > 1]

Unnamed: 0,permanent_name,display_name,city,state,country_code,member_since,occupation,loan_because,loan_purchase_num,invited_by,num_invited
1,reena6733,Reena,,,,1461300634,,,9.0,,0
3,andrew86079135,Andrew,,,,1461301091,,,5.0,Peter Tan,0
6,rene7585,Rene,,,,1461301636,,,2.0,,0
7,harald2826,Harald,,,,1461301670,,,2.0,,0
11,jennifer4328,Jennifer,,,,1461302712,,,3.0,,0
...,...,...,...,...,...,...,...,...,...,...,...
2349158,rakhi,Rakhi,New York,New York,US,1342100607,Student,I care.,4.0,,0
2349162,james75291930,James,,,,1342096854,,,6.0,,0
2349164,carol8279,Carol,,,,1342099416,,,5.0,,0
2349166,eric91401545,Eric,,,,1342100719,,,2.0,,0
