In [55]:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt

import seaborn as sns
from bs4 import BeautifulSoup

In [56]:
# Functions file for personality prediction file

import unicodedata
from nltk.tokenize import word_tokenize
import pandas as pd
from nltk.stem.snowball import SnowballStemmer
from nltk.corpus import stopwords
import string
import re
from bs4 import BeautifulSoup

REPLACE_BY_SPACE_RE = re.compile('[/(){}\\[\]\|?@,;]')
BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
punctuation_ = set(string.punctuation)
stopwords_ = set(stopwords.words('english'))

def remove_accents(input_str):
    nfkd_form = unicodedata.normalize('NFKD', input_str)
    only_ascii = nfkd_form.encode('ASCII', 'ignore')
    return only_ascii.decode()

def filter_tokens(sent):
    return ([w for w in sent if not w in stopwords_ and not w in punctuation_])

def remove_link(sent):
    return [s for s in sent if 'http' not in s]

def wt(text):
    return [word_tokenize(sent) for sent in text.split()]

def flatten(lst):
    return [item for sublist in lst for item in sublist]

def snow_stem(text):
    snowball = SnowballStemmer('english')
    return [snowball.stem(word) for word in text]

def rm_punc(sent):
    return [w for w in sent if not w in punctuation_]

def clean_text(text):
    text = BeautifulSoup(text, "lxml").text # HTML decoding
    text = REPLACE_BY_SPACE_RE.sub(' ', text) # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = BAD_SYMBOLS_RE.sub('', text) # delete symbols which are in BAD_SYMBOLS_RE from text
    text = ' '.join(word for word in text.split() if word not in stopwords_)
    return text

def clean_df(df):
    """Normalize the data, change all letters to lower case, split into 
    sentences,remove website links, tokenize words into a new dataframe then:
    filter punctuation, filter stopwords, stemminize
    INPUT
    --------------
    df = Pandas DataFrame
    OUTPUT
    --------------
    word = Pandas DataFrames
    ______________
    """
    newdf = df.copy()
    newdf.posts = newdf.posts.apply(lambda x: remove_accents(x).lower())
    newdf = pd.DataFrame((newdf.type,newdf.posts.apply(lambda x: x.split('|||')))).T
    newdf.posts = newdf.posts.apply(lambda x: remove_link(x))
    newdf.posts = newdf.posts.apply(lambda x: ' '.join(x))
    newdf.posts = newdf.posts.apply(lambda x: clean_text(x))
    newdf.posts = newdf.posts.apply(lambda x: wt(x))
    newdf.posts = newdf.posts.apply(lambda x: flatten(x))
    newdf.posts = newdf.posts.apply(lambda x: snow_stem(x))
    newdf.posts = newdf.posts.apply(lambda x: ' '.join(x))
    return newdf

In [57]:
df = pd.read_json('data/data.json')

In [58]:
def fraud(string):
    if 'fraud' in string:
        return True
    else:
        return False

In [59]:
df['fraud'] = df.acct_type.apply(lambda x: fraud(x))

In [60]:
df.columns

Index(['acct_type', 'approx_payout_date', 'body_length', 'channels', 'country',
       'currency', 'delivery_method', 'description', 'email_domain',
       'event_created', 'event_end', 'event_published', 'event_start',
       'fb_published', 'gts', 'has_analytics', 'has_header', 'has_logo',
       'listed', 'name', 'name_length', 'num_order', 'num_payouts',
       'object_id', 'org_desc', 'org_facebook', 'org_name', 'org_twitter',
       'payee_name', 'payout_type', 'previous_payouts', 'sale_duration',
       'sale_duration2', 'show_map', 'ticket_types', 'user_age',
       'user_created', 'user_type', 'venue_address', 'venue_country',
       'venue_latitude', 'venue_longitude', 'venue_name', 'venue_state',
       'fraud'],
      dtype='object')

In [61]:
df.head()

Unnamed: 0,acct_type,approx_payout_date,body_length,channels,country,currency,delivery_method,description,email_domain,event_created,event_end,event_published,event_start,fb_published,gts,...,previous_payouts,sale_duration,sale_duration2,show_map,ticket_types,user_age,user_created,user_type,venue_address,venue_country,venue_latitude,venue_longitude,venue_name,venue_state,fraud
0,fraudster_event,1266062400,3852,5,US,USD,0.0,"<p><a href=""http://s432.photobucket.com/albums...",gmail.com,1262739706,1265630400,1263110000.0,1265594400,0,0.0,...,[],29.0,33,1,"[{'event_id': 527017, 'cost': 25.0, 'availabil...",36,1259613950,1,717 Washington Avenue,US,25.777471,-80.133433,INK Nightclub - South Beach,FL,True
1,premium,1296720000,3499,0,US,USD,1.0,"<p>Join us for a quick, one-night, community-b...",ruf.org,1293832670,1296288000,1293833000.0,1296255600,0,868.02,...,"[{'name': 'RUF', 'created': '2010-10-01 01:10:...",28.0,28,0,"[{'event_id': 786878, 'cost': 35.0, 'availabil...",149,1280942776,3,,US,32.776566,-79.930922,"The Charleston, SC area",SC,False
2,premium,1296172800,2601,8,US,USD,1.0,"<h3><span class=""subcategory""><strong>Teacher ...",pvsd.k12.ca.us,1291090956,1295740800,1291092000.0,1295713800,0,3500.0,...,"[{'name': 'Danielle Severn', 'created': '2010-...",54.0,54,0,"[{'event_id': 787337, 'cost': 93.51, 'availabi...",214,1272559388,3,10100 Pioneer Blvd Suite 100,US,33.944201,-118.080419,Los Angeles County Office of Education,CA,False
3,premium,1388966400,12347,6,IE,EUR,1.0,"<p style=""margin-bottom: 1.3em; padding-bottom...",irishtabletennis.com,1360681570,1388534400,1360683000.0,1360702800,0,1167.35,...,"[{'name': '', 'created': '2010-11-09 01:10:15'...",0.0,0,0,"[{'event_id': 885645, 'cost': 25.0, 'availabil...",889,1283870102,3,,,,,,,False
4,premium,1297900800,2417,11,US,USD,0.0,<p>Writers and filmmakers need to understand t...,artsandbusinesscouncil.org,1291994666,1297468800,1291995000.0,1297440000,1,2313.15,...,[{'name': 'Arts and Business Council or Greate...,63.0,63,0,"[{'event_id': 1114349, 'cost': 150.0, 'availab...",35,1288984065,3,One Marina Park Drive,US,42.353848,-71.044276,Fish & Richardson,MA,False


In [62]:

df['description'] = df.description.apply(lambda x: BeautifulSoup(x).text)

In [74]:
def preprocess(text):
    text = text.str.replace('\n', '')
    text = text.str.replace('www','')
    text = text.str.replace('com','')
    text = text.str.replace('http', '')
    text = text.str.replace('00', '')
    text = text.str.replace('30', '')
    text = text.str.replace('15', '')
    text = text.str.replace('10','')
    return text
df['description'] = preprocess(df.description)

In [64]:
df.acct_type.value_counts()

premium             12373
fraudster_event       851
fraudster             437
spammer_limited       218
spammer_warn          144
tos_warn               91
spammer_noinvite       84
tos_lock               77
locked                 54
fraudster_att           5
spammer_web             2
spammer                 1
Name: acct_type, dtype: int64

In [65]:
import pandas as pd
import numpy as np
from textblob import TextBlob
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
import warnings 
warnings.filterwarnings('ignore')
import matplotlib
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
matplotlib.rcParams['figure.figsize'] = (10.0, 6.0)
import plotly.graph_objs as go
import chart_studio.plotly as py
# import cufflinks
pd.options.display.max_columns = 30
from IPython.core.interactiveshell import InteractiveShell
import plotly.figure_factory as ff
InteractiveShell.ast_node_interactivity = 'all'
from plotly.offline import iplot
# cufflinks.go_offline()
# cufflinks.set_config_file(world_readable=True, theme='pearl')
from sklearn.decomposition import TruncatedSVD
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.manifold import TSNE
from bokeh.plotting import figure, output_file, show
from bokeh.models import Label
from bokeh.io import output_notebook
output_notebook()
from collections import Counter
# import scattertext as st
# import spacy
from pprint import pprint
# import en_core_web_sm

In [66]:
df[df['acct_type'] == 'fraudster'].sample(10)

Unnamed: 0,acct_type,approx_payout_date,body_length,channels,country,currency,delivery_method,description,email_domain,event_created,event_end,event_published,event_start,fb_published,gts,...,previous_payouts,sale_duration,sale_duration2,show_map,ticket_types,user_age,user_created,user_type,venue_address,venue_country,venue_latitude,venue_longitude,venue_name,venue_state,fraud
14131,fraudster,1331895600,455,5,HR,USD,0.0,MARTINI BAND LIVE10.3.2012. @ GRIMZOC BAR BESP...,gmail.com,1330881173,1331463600,1330881000.0,1331438400,0,219.26,...,[],6.0,6,1,"[{'event_id': 3073697, 'cost': 10.0, 'availabi...",0,1330881172,1,Valentina Vodnika,HR,45.805414,15.966844,,City of Zagreb,True
8001,fraudster,1238904000,1857,6,GB,GBP,,VERY FEW TICKETS REMAININGBOOK NOW TO AVOID DI...,lidf.co.uk,1236452739,1238472000,1236454000.0,1238461200,0,0.0,...,"[{'name': '', 'created': '2011-06-16 01:11:28'...",23.0,23,1,"[{'event_id': 301647, 'cost': 10.0, 'availabil...",1044,1146260369,3,30 Colonnade,GB,51.522928,-0.12398,The Horse Hospital,"London, City Of",True
8658,fraudster,1334358000,1044,5,US,USD,0.0,Manno Bernini Garment Company will be hosting ...,aol.com,1333211180,1333926000,1333212000.0,1333742400,0,752.02,...,[],6.0,6,1,"[{'event_id': 3260195, 'cost': 280.0, 'availab...",0,1333211179,1,3770 Las Vegas Blvd.,US,36.104189,-115.173217,Monte Carlo Resort & Casino - Grand Ballroom &...,NV,True
9582,fraudster,1314838800,261,8,US,USD,0.0,4 Rounds of Game/30 minutesEF: $85 ($80 LACC m...,ymail.com,1314255958,1314406800,1314256000.0,1314399600,0,362.68,...,[],1.0,1,0,"[{'event_id': 2087871, 'cost': 85.0, 'availabi...",14,1313079424,1,196 Chasepointe Drive,US,36.340033,-86.645186,Absolution Chess Club,TN,True
10608,fraudster,1324699200,1937,11,US,USD,0.0,YAPPY HOLIDAYS!SANTA PAWS fido-friendly holida...,yahoo.com,1322675758,1324267200,1322676000.0,1324260000,1,44.18,...,"[{'name': 'C.Cornish', 'created': '2011-10-30 ...",18.0,18,1,"[{'event_id': 2575906, 'cost': 20.0, 'availabi...",62,1317308021,1,411 North New River Drive East,US,26.117985,-80.138447,Cruise Along The New River,FL,True
10507,fraudster,1302217200,18156,5,US,USD,0.0,VIP FREE TICKET OFFER BUY 1 VIP GET 1 FREE...B...,att.net,1290457125,1301785200,1290708000.0,1299963600,0,32.64,...,"[{'name': '', 'created': '2011-04-07 01:13:53'...",107.0,110,0,"[{'event_id': 1067787, 'cost': 30.0, 'availabi...",0,1290457124,3,,,0.0,0.0,TBA,,True
1509,fraudster,1295222400,157,0,US,USD,0.0,I invite all to the party. I promise that will...,yahoo.com,1294519550,1294790400,1294520000.0,1294736400,0,794.16,...,[],3.0,3,0,"[{'event_id': 1182843, 'cost': 0.0, 'availabil...",0,1294519549,1,8840 Cowenton Ave,US,39.388104,-76.440374,good party,MD,True
11610,fraudster,1325851200,726,6,US,USD,0.0,5hr OPEN bar from 9:30pm to 2:30am!Hors d'oeu...,gmail.com,1323719792,1325419200,1323990000.0,1325394000,1,4705.25,...,"[{'name': 'Midtown Bar & Restaurant LLC', 'cre...",16.0,19,1,"[{'event_id': 2629315, 'cost': 100.0, 'availab...",1017,1235846234,4,986 2nd Avenue,US,40.756154,-73.967236,Traffic,NY,True
11123,fraudster,1329958800,428,6,US,USD,0.0,Hello. I the chess player and I have decided t...,yahoo.com,1329336165,1329526800,1329337000.0,1329512400,0,805.76,...,"[{'name': '', 'created': '2012-03-07 03:12:33'...",2.0,2,1,"[{'event_id': 2961995, 'cost': 190.0, 'availab...",14,1328136990,1,10768 Seacliff Circle,US,26.397105,-80.216053,,FL,True
7544,fraudster,1305802800,131,5,US,USD,0.0,Get your pre-memorial day party started with G...,yahoo.com,1305143128,1305370800,1305144000.0,1305342000,0,5031.62,...,[],2.0,2,0,"[{'event_id': 1675347, 'cost': 600.0, 'availab...",0,1305143124,1,724 Nostrand Ave,US,40.673903,-73.950235,Secrets Night Clun,NY,True


In [67]:
df.columns

Index(['acct_type', 'approx_payout_date', 'body_length', 'channels', 'country',
       'currency', 'delivery_method', 'description', 'email_domain',
       'event_created', 'event_end', 'event_published', 'event_start',
       'fb_published', 'gts', 'has_analytics', 'has_header', 'has_logo',
       'listed', 'name', 'name_length', 'num_order', 'num_payouts',
       'object_id', 'org_desc', 'org_facebook', 'org_name', 'org_twitter',
       'payee_name', 'payout_type', 'previous_payouts', 'sale_duration',
       'sale_duration2', 'show_map', 'ticket_types', 'user_age',
       'user_created', 'user_type', 'venue_address', 'venue_country',
       'venue_latitude', 'venue_longitude', 'venue_name', 'venue_state',
       'fraud'],
      dtype='object')

In [68]:
def get_top_n_words(corpus, n=None):
    vec = CountVectorizer(stop_words='english').fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0,idx]) for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key= lambda x: x[1], reverse=True)
    return words_freq[:n]

In [69]:
# top 20 words before removing stop words
common_words = get_top_n_words(df['description'], 20)
for word, freq in common_words:
    print(word, freq)

event 8852
new 6286
com 6113
business 5978
00 5526
20 5011
time 4979
tickets 4967
30 4698
day 4383
www 4103
ticket 3938
10 3918
learn 3371
join 3359
make 3350
course 3317
people 3266
available 3186
information 3177


In [71]:
df.description

0         Party Starz Entertaintment & Diverse Internat...
1        Join us for a quick, one-night, community-buil...
2        Teacher Training on the Desired Results Develo...
3        Affiliations are now due in respect of the 201...
4        Writers and filmmakers need to understand thei...
5        Come join the Bluegrass Stallions as they take...
6                                                         
7        WHEN JEKYLL MET HYDEThe Magnetic Theatre in Th...
8        Merchants Exchange Productions and the Julia M...
9        WHEN JEKYLL MET HYDEThe Magnetic Theatre in Th...
11       monkeyface eel champion, blogger and street pe...
12       You’ve got a profile on LinkedIn – Now What?Th...
13       Enchantment: The Art of Changing Hearts, Minds...
14       Top celebrity & editorial stylists Ilaria Urbi...
15       "An education film that gets it"Washington Pos...
16       "An education film that gets it"Washington Pos...
17                                                      

In [76]:
# top 20 words after removing stop words
common_words1 = get_top_n_words(df['description'], 20)
for word, freq in common_words1:
    print(word, freq)

event 8854
new 6288
business 5985
20 5769
pm 5597
time 4985
tickets 4981
day 4390
ticket 3942
learn 3373
join 3369
make 3351
course 3319
people 3267
available 3187
information 3179
bring 3106
class 3100
year 2974
workshop 2973


In [78]:
common_words_fraud = get_top_n_words(df['description'][df['fraud']==True], 20)
for word, freq in common_words_fraud:
    print(word, freq)

event 373
tickets 316
et 304
party 272
time 251
new 249
la 245
help 238
le 230
course 227
les 207
du 195
pm 188
ticket 188
night 184
des 182
best 181
day 180
world 178
free 174


In [79]:
common_words_clean = get_top_n_words(df['description'][df['fraud']==False], 20)
for word, freq in common_words_clean:
    print(word, freq)

event 8481
new 6039
business 5851
20 5640
pm 5409
time 4734
tickets 4665
day 4210
ticket 3754
join 3276
learn 3271
make 3229
information 3097
people 3095
course 3092
available 3034
class 2995
bring 2945
workshop 2836
year 2831


In [52]:
# need to remove all URL
# df.description = df.description.apply(lambda x: remove_link(x))

In [77]:
def get_top_n_bigram(corpus, n=None):
    vec = CountVectorizer(ngram_range=(2, 2)).fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0)
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq = sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]

In [84]:
common_bigrams = get_top_n_bigram(df['description'], 20)
for word, freq in common_bigrams:
    print(word, freq)

of the 12829
will be 9447
in the 8493
to the 6234
at the 6228
on the 5150
for the 4973
if you 4165
and the 3875
how to 3749
you will 2846
to be 2779
with the 2729
the event 2712
you can 2522
from the 2433
is the 2229
we will 2221
20 20 2135
you are 2099


In [83]:
common_bigrams_fraud = get_top_n_bigram(df['description'][df['fraud']==True], 20)
for word, freq in common_bigrams_fraud:
    print(word, freq)

of the 558
will be 500
in the 435
to the 310
for the 227
at the 217
on the 196
how to 156
if you 145
and the 139
to be 131
with the 120
you will 116
the event 116
this is 113
is the 113
can be 104
you can 101
to help 100
we will 98


In [85]:
common_bigrams_clean = get_top_n_bigram(df['description'][df['fraud']==False], 20)
for word, freq in common_bigrams_clean:
    print(word, freq)

of the 12271
will be 8947
in the 8058
at the 6011
to the 5924
on the 4954
for the 4746
if you 4020
and the 3736
how to 3593
you will 2730
to be 2648
with the 2609
the event 2596
you can 2421
from the 2343
20 20 2133
we will 2123
is the 2116
you are 2019


In [87]:
def get_top_n_trigram(corpus, n=None):
    vec = CountVectorizer(ngram_range=(3, 3)).fit(corpus)
    bag_of_words = vec.transform(corpus)
    sum_words = bag_of_words.sum(axis=0) 
    words_freq = [(word, sum_words[0, idx]) for word, idx in vec.vocabulary_.items()]
    words_freq =sorted(words_freq, key = lambda x: x[1], reverse=True)
    return words_freq[:n]

In [88]:
common_trigrams = get_top_n_trigram(df['description'], 20)
for word, freq in common_trigrams:
    print(word, freq)

20 20 20 1558
as well as 1178
one of the 1015
for more information 966
join us for 961
at the door 925
if you have 808
if you are 727
learn how to 709
there will be 703
some of the 552
to the event 533
you will be 528
be able to 514
0a 20 20 511
will be available 489
would like to 464
you need to 462
we will be 461
will be provided 449


In [91]:
common_trigrams_fraud = get_top_n_trigram(df['description'][df['fraud']==True], 20)
for word, freq in common_trigrams_fraud:
    print(word, freq)

as well as 55
will be available 48
one of the 40
there will be 39
if you have 36
will help you 35
to help you 34
for more info 33
would like to 29
this event is 28
we will be 27
look at how 27
food and wine 27
you will be 26
some of the 26
be able to 26
your personal taste 26
interesting look at 26
feel free to 25
learn how to 25


In [96]:
common_trigrams_clean = get_top_n_trigram(df['description'][df['fraud']==False], 20)
for word, freq in common_trigrams_clean:
    print(word, freq)

20 20 20 1558
as well as 1123
one of the 975
for more information 946
join us for 943
at the door 910
if you have 772
if you are 709
learn how to 684
there will be 664
some of the 526
to the event 519
0a 20 20 511
you will be 502
be able to 488
you need to 443
will be available 441
will be provided 436
would like to 435
we will be 434


In [None]:
df.loc[df[]]