In [24]:
import pandas as pd
import os
from nltk.corpus import stopwords
import nltk.data
import logging
import time
from datetime import datetime
import numpy as np  # Make sure that numpy is imported
from gensim.models import Word2Vec
from sklearn.cluster import KMeans
from sklearn.ensemble import RandomForestClassifier

from KaggleWord2VecUtility import KaggleWord2VecUtility
import json
import cPickle as pickle
import re
from operator import itemgetter

In [166]:
# whocalled has 171660 with comments among 555146 records
#scrapy_data = '/home/fujun/phone-data/whocalled.json'
# callercomplaints has 184245 unique phones with comments among 188130 records
data_root = '/home/fujunl/phone-scraping-data/'
scrapy_data = data_root + 'callercomplaints.json'
phone_stat = dict()
caller_type = dict()
caller_type_short = {u'Debt Collector': 0, u'Unknown': 1, u'Prank Caller': 2, 
                     u'Political Call': 3, u'Telemarketer': 4}

year_min = 2008
time_slots = 12*(2016 - year_min + 1)
default_type_dist = {u'Debt Collector': 0, u'Unknown': 0, u'Prank Caller': 0, u'Political Call': 0, u'Telemarketer': 0}

caller_type_time = np.zeros((len(caller_type_short), time_slots))
with open(scrapy_data) as f:
    data = json.load(f)
    for record in data:
        phone_id = record['phone_id'].split('-')[0]
        if phone_id not in phone_stat:
            phone_stat[phone_id] = {'n_comments':0, 'time_dist':np.zeros(time_slots), 'type_dist':dict(default_type_dist)}
            
        for comment in record['phone_comments']:
            if comment:
                phone_stat[phone_id]['n_comments'] += 1
                                   
                time_str = re.search('[0-9]+/[0-9]+/[0-9]+', comment['time']).group()
                call_time = datetime.strptime(time_str, '%m/%d/%Y')
                slot_id = 12*(call_time.year - year_min) + call_time.month - 1
                phone_stat[phone_id]['time_dist'][slot_id] += 1
                
                #year_min = min(year_min, call_time.year)
                tmp_type = comment['caller_type']
                if tmp_type in caller_type:
                    caller_type[tmp_type] += 1
                else:
                    caller_type[tmp_type] = 1
                phone_stat[phone_id]['type_dist'][tmp_type] += 1
                
                caller_type_time[caller_type_short[tmp_type]][slot_id] += 1

#with open('callercomplaints_stat.p', 'wb') as fp:
    #pickle.dump(phone_stat, fp)

phone_ncomments = [(phone_id, phone_stat[phone_id]['n_comments']) for phone_id in phone_stat.keys()]
with open('callercomplaints_ncomments.p', 'wb') as fp:
    pickle.dump(phone_ncomments, fp)
    
sorted_phones = sorted(phone_ncomments, key=itemgetter(1), reverse=True)
print len(sorted_phones)
for stat in sorted_phones[:10]:
    print stat[0], stat[1]

184245
5125025384 4141
5617849844 1409
7636576794 1246
2519478966 1098
3864276893 1061
4165231121 964
9042594351 959
2016215695 944
3304922913 899
8152357297 876


In [167]:
from collections import Counter
num_comments = [phone_stat[phone]['n_comments'] for phone in phone_stat.keys()]

print np.max(num_comments),np.min(num_comments),np.median(num_comments)
call_counter = Counter(num_comments)
ncomplaints_freq = np.array(call_counter.values(),'float')
ncomplaints_sum = np.sum(ncomplaints_freq)
ncomplaints_prob = ncomplaints_freq/ncomplaints_sum
complaints_dist = np.zeros(7,'float')
xticks = ['1', '[2,5]','[6,10]','[11,50]','[51,100]','>100']
complaints_dist[0] = np.sum(ncomplaints_prob[0])
complaints_dist[1] = np.sum(ncomplaints_prob[1:5])
complaints_dist[2] = np.sum(ncomplaints_prob[5:10])
complaints_dist[3] = np.sum(ncomplaints_prob[10:50])
complaints_dist[4] = np.sum(ncomplaints_prob[50:100])
complaints_dist[5] = np.sum(ncomplaints_prob[100:])

print np.sum(complaints_dist),complaints_dist
fig = plt.figure()
ax = fig.add_subplot(111)
x = np.arange(len(complaints_dist))
plt.ylim([0,1.0])
plt.step(x,complaints_dist, where='post', linewidth=3.0)
for i in range(len(complaints_dist)-1):
    ax.text(i+0.2,complaints_dist[i]+0.02, xticks[i])
plt.setp(ax.get_xticklabels(), visible=False)
plt.yticks(np.arange(0, 1, 0.05))
plt.xlabel('# of Complaints')
plt.ylabel('Prob. Distribution of Complaint Numbers')
plt.title('Phones in callercomplaints.com')
plt.show()
fig.savefig('callercomplaints_complaint_prob.png')

4141 1 1.0
1.0 [ 0.69272979  0.19470813  0.04909767  0.05292952  0.0064208   0.00411409
  0.        ]


In [76]:
import matplotlib.pyplot as plt

caller_type_lst = ['Debt Collector', 'Prank Caller', 
                     'Political Call','Telemarketer']

caller_type_time4 = np.delete(caller_type_time, 1, axis=0)
labels = [None] * len(caller_type_lst)
x = np.arange(time_slots)
cutoff = time_slots-12+2
fig = plt.figure()
ax = fig.add_subplot(111)
for i in range(len(caller_type_lst)):
    #print caller_type_lst[i]
    labels[i], = plt.plot(x[:cutoff], caller_type_time4[i][:cutoff], '-', linewidth=3.0)

curr_year = year_min
for i in range(11,time_slots,12):
    print i
    ax.text(i-10.5, 6000, curr_year)
    curr_year += 1
    plt.plot([i,i], [0,6000], 'r-', linewidth=3.0)
    
plt.legend(labels, caller_type_lst)
plt.title('Time Distribution of Different Caller Types')
plt.show()
fig.savefig('call_type_time.png')

11
23
35
47
59
71
83
95
107


In [72]:
import matplotlib.pyplot as plt
nyears = 2016 - year_min + 1

x = np.arange(time_slots)
fig = plt.figure()
ax = fig.add_subplot(111)
cutoff = time_slots-12+2
#plt.plot(x[:cutoff], caller_type_time[0][:cutoff], '-', linewidth=3.0)
#plt.plot(x[:cutoff], caller_type_time[2][:cutoff], '-', linewidth=3.0)
plt.plot(x[:cutoff], caller_type_time[3][:cutoff], '-', linewidth=3.0)
# plot year
curr_year = year_min
for i in range(11,time_slots,12):
    print i
    ax.text(i-10.5, 300, curr_year)
    curr_year += 1
    plt.plot([i,i], [0,300], 'r-', linewidth=3.0)
plt.title('Time Distribution of Politica Calls')
plt.show()
fig.savefig('call_type_time_politica.png')

11
23
35
47
59
71
83
95
107


In [58]:
# load data and do analysis
#with open('callercomplaints_stat.p', 'rb') as fp:
    #phone_stat = pickle.load(fp)
    
phone_ncomments = [(phone_id, phone_stat[phone_id]['n_comments']) for phone_id in phone_stat.keys()]
sorted_phones = sorted(phone_ncomments, key=itemgetter(1), reverse=True)
print len(sorted_phones)
for stat in sorted_phones[:1]:
    print stat[0], stat[1]
    print phone_stat[stat[0]]['n_comments']

184245
5125025384 4141
4141


In [164]:
# whocalled has 171660 with comments among 555146 records
#scrapy_data = '/home/fujun/phone-data/whocalled.json'
# callercomplaints has 184245 unique phones with comments among 188130 records
data_root = '/home/fujunl/phone-scraping-data/'
scrapy_data = data_root + 'whocalled.json'
phone_stat = dict()
year_min = 2005
time_slots = 12*(2016 - year_min + 1)
location_stat = dict()
with open(scrapy_data) as f:
    data = json.load(f)
    for record in data:
        if not record['comments']:
            continue
        phone_id = record['phone']
        if phone_id not in phone_stat:
            phone_stat[phone_id] = {'n_comments':0, 'time_dist':np.zeros(time_slots)}
        for comment in record['comments']:
            if comment:
                if comment['location']:
                    location = comment['location'][0].strip().lower()
                    loc_index = 0
                    for m in re.finditer('[0-9]+', location):
                        loc_index = max(loc_index, m.span()[1])
                    location = location[loc_index:].strip()
                    
                    #if location1 != location:
                        #print location, location1
                        
                    if location in location_stat:
                        location_stat[location] = location_stat[location]+1
                    else:
                        location_stat[location] = 1
                    
                phone_stat[phone_id]['n_comments'] += 1
                #print str(comment['time'])
                call_time = datetime.strptime(comment['time'][0], '%Y-%m-%d %H:%M:%S')
                slot_id = 12*(call_time.year - year_min) + call_time.month - 1
                phone_stat[phone_id]['time_dist'][slot_id] += 1

#with open('whocalled_stat.p', 'wb') as fp:
    #pickle.dump(phone_stat, fp)
    
#with open('whocalled_location_stat.p', 'wb') as fp:
    #pickle.dump(location_stat, fp)

phone_ncomments = [(phone_id, phone_stat[phone_id]['n_comments']) for phone_id in phone_stat.keys()]
with open('whocalled_ncomments.p', 'wb') as fp:
    pickle.dump(phone_ncomments, fp)
    
sorted_phones = sorted(phone_ncomments, key=itemgetter(1), reverse=True)
print len(sorted_phones)
for stat in sorted_phones[:10]:
    print stat[0], stat[1]

171660
0000000000 1315
8009681875 895
8592121501 884
8004461022 826
2013738371 785
8009184224 783
5132176631 783
3239271550 669
6136883625 645
8006790336 643


In [165]:
from collections import Counter
num_comments = [phone_stat[phone]['n_comments'] for phone in phone_stat.keys()]

print np.max(num_comments),np.min(num_comments),np.median(num_comments)
call_counter = Counter(num_comments)
ncomplaints_freq = np.array(call_counter.values(),'float')
ncomplaints_sum = np.sum(ncomplaints_freq)
ncomplaints_prob = ncomplaints_freq/ncomplaints_sum
complaints_dist = np.zeros(7,'float')
xticks = ['1', '[2,5]','[6,10]','[11,50]','[51,100]','>100']
complaints_dist[0] = np.sum(ncomplaints_prob[0])
complaints_dist[1] = np.sum(ncomplaints_prob[1:5])
complaints_dist[2] = np.sum(ncomplaints_prob[5:10])
complaints_dist[3] = np.sum(ncomplaints_prob[10:50])
complaints_dist[4] = np.sum(ncomplaints_prob[50:100])
complaints_dist[5] = np.sum(ncomplaints_prob[100:])

print np.sum(complaints_dist),complaints_dist
fig = plt.figure()
ax = fig.add_subplot(111)
x = np.arange(len(complaints_dist))
plt.ylim([0,1.0])
plt.step(x,complaints_dist, where='post', linewidth=3.0)
for i in range(len(complaints_dist)-1):
    ax.text(i+0.2,complaints_dist[i]+0.02, xticks[i])
plt.setp(ax.get_xticklabels(), visible=False)
plt.yticks(np.arange(0, 1, 0.05))
plt.xlabel('# of Complaints')
plt.ylabel('Prob. Distribution of Complaint Numbers')
plt.title('Phones in whocalled.us')
plt.show()
fig.savefig('whocalled_complaint_prob.png')

1315 1 1.0
1.0 [ 0.6665152   0.227036    0.04773389  0.04806012  0.00661773  0.00403705
  0.        ]


In [75]:
phone_ncomments = [(phone_id, phone_stat[phone_id]['n_comments']) for phone_id in phone_stat.keys()]
sorted_phones = sorted(phone_ncomments, key=itemgetter(1), reverse=True)
print len(sorted_phones)
for stat in sorted_phones[:10]:
    print stat[0], stat[1]
    #print phone_stat[stat[0]]['n_comments']

171660
0000000000 1315
8009681875 895
8592121501 884
8004461022 826
2013738371 785
8009184224 783
5132176631 783
3239271550 669
6136883625 645
8006790336 643


In [99]:
print location_stat['ghana']

11


In [139]:
from geopy.geocoders import Nominatim
geolocator = Nominatim()
location_geo = dict()
sidemap = {'la':'Los Angeles,California', 'nt canada':'Northwest Territories,canada', 
        'virgin islands, u.s.':'virgin islands', 'il':'Illinois'}
for addr in location_stat.keys():
    try:
        lat_long = geolocator.geocode(addr)
    except GeocoderTimedOut as e:
        print("Error: geocode failed on input {} with message {}".format(addr, e.msg))
    
    if not lat_long:
        print 'geo for {} not found'.format(addr)
        if addr in mymap:
            lat_long = geolocator.geocode(mymap[addr])
            if not lat_long:
                print 'geo for {} NOT found even using {}'.format(addr,mymap[addr])
            else:
                print 'geo for {} found using {}'.format(addr,mymap[addr])
        else:
            print 'I don not know how to find {}'.format(addr)
    
    #print '{}:{}'.format(addr, str(lat_long))
    if lat_long:
        location_geo[addr] = {'latitude':lat_long.latitude, 
                              'longitude':lat_long.longitude, 'n_called':location_stat[addr]}

print len(location_geo), len(location_stat)
with open('location_geo.p', 'wb') as fp:
    pickle.dump(location_geo, fp)

geo for la not found
geo for la found using Los Angeles,California
geo for virgin islands, u.s. not found
geo for virgin islands, u.s. found using virgin islands
geo for nt canada not found
geo for nt canada found using Northwest Territories,canada
geo for satellite provider not found
I don not know how to find satellite provider
geo for il not found
geo for il found using Illinois
224 225


In [16]:
import matplotlib.pyplot as plt
from mpl_toolkits.basemap import Basemap
with open('location_geo.p', 'rb') as fp:
    location_geo = pickle.load(fp)

fig = plt.figure()
themap = Basemap(projection='gall', resolution = 'l', area_thresh = 100000.0)
themap.drawcoastlines()
themap.drawcountries()
themap.fillcontinents(color = 'gainsboro')
themap.drawmapboundary(fill_color='steelblue')

lons = [location['longitude'] for location in location_geo.values()]
lats = [location['latitude'] for location in location_geo.values()]
called = np.array([location['n_called'] for location in location_geo.values()])
mark_sz = 3*((called // 10000) + 1)

x, y = themap(lons, lats)
for i in range(len(x)):
    themap.plot(x[i], y[i], 'o', color='Red', markersize=mark_sz[i])

plt.title('Geographical Distribution of Scams Received')
plt.show()

fig.savefig('call.png')

In [163]:
with open('callercomplaints_ncomments.p', 'rb') as fp:
    cc_ncomments = pickle.load(fp)

with open('whocalled_ncomments.p', 'rb') as fp:
    wc_ncomments = pickle.load(fp)

cc_phones = set([entry[0] for entry in cc_ncomments])
wc_phones = set([entry[0] for entry in wc_ncomments])

cc_dict = dict(cc_ncomments)
wc_dict = dict(wc_ncomments)
common_phones = cc_phones.intersection(wc_phones)
common_phone_comments = np.array([cc_dict[com_ph] + wc_dict[com_ph] for com_ph in common_phones])
print len(common_phone_comments), np.max(common_phone_comments),np.min(common_phone_comments), np.median(common_phone_comments)


from collections import Counter
num_comments = common_phone_comments

print np.max(num_comments),np.min(num_comments),np.median(num_comments)
call_counter = Counter(num_comments)
ncomplaints_freq = np.array(call_counter.values(),'float')
ncomplaints_sum = np.sum(ncomplaints_freq)
ncomplaints_prob = ncomplaints_freq/ncomplaints_sum
complaints_dist = np.zeros(7,'float')
xticks = ['2', '[3,5]','[6,10]','[11,50]','[51,100]','>100']
complaints_dist[0] = np.sum(ncomplaints_prob[0])
complaints_dist[1] = np.sum(ncomplaints_prob[1:4])
complaints_dist[2] = np.sum(ncomplaints_prob[4:9])
complaints_dist[3] = np.sum(ncomplaints_prob[10:50])
complaints_dist[4] = np.sum(ncomplaints_prob[50:100])
complaints_dist[5] = np.sum(ncomplaints_prob[100:])

print np.sum(complaints_dist),complaints_dist
fig = plt.figure()
ax = fig.add_subplot(111)
x = np.arange(len(complaints_dist))
plt.ylim([0,1.0])
plt.step(x,complaints_dist, where='post', linewidth=3.0)
for i in range(len(complaints_dist)-1):
    ax.text(i+0.2,complaints_dist[i]+0.02, xticks[i])
plt.setp(ax.get_xticklabels(), visible=False)
plt.yticks(np.arange(0, 1, 0.05))
plt.xlabel('# of Complaints')
plt.ylabel('Prob. Distribution of Complaint Numbers')
plt.title('Phones in Both Dataset')
plt.show()
fig.savefig('common_complaint_prob.png')

31045 1608 2 6.0
1608 2 6.0
0.980641004993 [ 0.29895313  0.19922693  0.14975036  0.24300209  0.04941214  0.04029634
  0.        ]
