In [640]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [342]:
# to get JSON files from TOS API
import urllib, json
import pandas as pd
import numpy as np
from pandas.io.json import json_normalize

In [119]:
#create a list of file names to pull each json file
from os import listdir
from os.path import isfile, join
companies = [f for f in listdir('tosdr.org/api/1/service') if isfile(join('tosdr.org/api/1/service', f))]

In [784]:
# nlp imports, chose TFIDF since some words are repeated
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import train_test_split

In [836]:
#model imports
from sklearn.ensemble import RandomForestClassifier
from sklearn.ensemble import GradientBoostingClassifier

# Create Master DF

## Data Cleaning

All of the data for ToS;DR is stored in a directory of different json files, each labeled with a different company name. 

In [244]:
#Pull out the data 
data_list = []

for company in companies:
    with open(f'tosdr.org/api/1/service/{company}') as json_data:
        data_list.append(json.load(json_data))

In [654]:
#Create main dataframe, drop unnecessary columns, expand list of lists

pd_list = []
for data in data_list:
    pd_list.append(pd.DataFrame.from_dict(data['pointsData']).T.loc[:,['id','quoteText','services','title','tosdr']])

alldata_df = pd.concat(pd_list,axis=0).reset_index()
todsr_df = json_normalize(alldata_df['tosdr'])
df = pd.concat((alldata_df,todsr_df),axis=1)
df = df.drop('tosdr',axis=1).explode('services')

In [655]:
#create master document column with tldr as base and quote text as secondar

df['document'] = df['tldr']
df['document'] = np.where(df['document'] == '',df['quoteText'],df['document'])
df['document'] = np.where(df['document'] == 'Generated through the annotate view',df['quoteText'],df['document'])
df['document'].fillna(df['quoteText'])

#if there is no text in the tldr or quoteText column, drop and reset index
df.dropna(axis=0,subset=['document'],inplace=True)
df = df.reset_index(drop=True)

In [672]:
#create labels, the point is really to distinquish "bad" language so labeled 
#both good and neutral as a success

df['label'] = df['point']
df.label = df.label.replace(['bad','neutral','good'],['bad','good','good'])

In [673]:
df

Unnamed: 0,index,id,quoteText,services,title,binding,case,point,privacyRelated,score,tldr,sources,irrelevant,reason,tmp_rating,document,label
0,4411,4411,"Signal does not sell, rent or monetize your pe...",signal,This service does not sell your personal data,True,This service does not sell your personal data,good,True,25,"The terms of service state that ""Signal does n...",,,,,"The terms of service state that ""Signal does n...",good
1,4412,4412,You must be at least 13 years old to use our S...,signal,You must be at least 13 years old to use the s...,True,This service is only available to users of a c...,neutral,,15,"The terms of service state: ""You must be at le...",,,,,"The terms of service state: ""You must be at le...",good
2,4414,4414,You agree to resolve any Claim you have with u...,signal,Court of law is in California,True,The court of law governing the terms is in loc...,neutral,,0,"The terms of service state: ""You agree to reso...",,,,,"The terms of service state: ""You agree to reso...",good
3,4415,4415,"We may modify, suspend, or terminate your acce...",signal,The service can delete your account without pr...,True,The service can delete your account without pr...,bad,,60,"The terms of service state: ""We may modify, su...",,,,,"The terms of service state: ""We may modify, su...",bad
4,4416,4416,We work with third parties to provide some of ...,signal,Third parties may be involved in operating the...,True,Third parties may be involved in operating the...,bad,,15,"The privacy policy states: ""We work with third...",,,,,"The privacy policy states: ""We work with third...",bad
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2555,4523,4523,,tos;dr,The terms for ToS;DR are easy to read,True,The terms for this service are easy to read,good,,15,"Altogether, ToS and PP only make up eight line...",,,,,"Altogether, ToS and PP only make up eight line...",good
2556,4524,4524,Nothing here should be considered legal advice...,tos;dr,ToS;DR gives no guarantee regarding quality,True,This services gives no guarantee regarding qua...,bad,,20,"The ToS state: ""Nothing here should be conside...",,,,,"The ToS state: ""Nothing here should be conside...",bad
2557,7679,7679,We do not use cookies or any other tracking te...,tos;dr,This service does not track you,True,This service does not track you,good,,100,Generated through the annotate view,,,,,We do not use cookies or any other tracking te...,good
2558,7683,7683,By contributing to this project (e.g.\nby send...,tos;dr,"If you offer suggestions to the service, they ...",True,"If you offer suggestions to the service, they ...",good,,50,Generated through the annotate view,,,,,By contributing to this project (e.g.\nby send...,good


In [541]:
#random lookup to check data

df.loc[np.random.randint(1,2564,5),['quoteText','tldr','document']]

Unnamed: 0,quoteText,tldr,document
1482,Cookies are small data files that are commonly...,This is probably more of a functional issue th...,This is probably more of a functional issue th...
582,"As we continue to develop our business, we mig...",Amazon may sell user data as part of a busines...,Amazon may sell user data as part of a busines...
2174,"Don’t use any kind of software, device or meth...",Generated through the annotate view,"Don’t use any kind of software, device or meth..."
735,We use Google Analytics which is a web analyti...,Generated through the annotate view,We use Google Analytics which is a web analyti...
662,In some circumstances and to the extent permit...,Generated through the annotate view,In some circumstances and to the extent permit...


In [575]:
#the land of sad documents
df.document[0],df.document[722]

('The terms of service state that "Signal does not sell, rent or monetize your personal data or content in any way – ever."',
 'Last updated: June 07, 2019')

# First Shitty Model 
### Start taking the parts of the dataframe that I want to keep

In [734]:
'''create a list of clean companies that can be used as stop words - use 
springcleaning function to format them in a similar way to the rest of 
tfiddy
'''
dirty_companies = [company.split('.') for company in companies]
clean_companies = [springcleaning(company[0]) for company in dirty_companies]
cleanest_companies = [company for sublist in clean_companies for company in sublist]

In [736]:
cleanest_companies

['signal',
 'musik',
 'sammler',
 'whatismyip',
 'com',
 'email',
 'cz',
 'upcloud',
 'customink',
 'brilliant',
 'pure',
 'zenimaxmediainc',
 'qwant',
 'virtbiz',
 'any',
 'do',
 'visible',
 'sprint',
 'stackoverflow',
 'symantec',
 'gitlab',
 'airbnb',
 'websaver',
 'mcdonald',
 'kitsu',
 'meetup',
 'weebly',
 'pexgle',
 'apple',
 'web',
 'de',
 'dr',
 'mcdougall',
 'shealth',
 'medicalcenter',
 'newegg',
 'com',
 'yelp',
 'carfax',
 'jetbrains',
 'diytubevideocommunity',
 'windowslogicproductions',
 'virgin',
 'discogs',
 'digitaladvertisingplatform',
 'reklamstore',
 'cnn',
 'forbes',
 'npm',
 'w3schools',
 'etesync',
 'librarything',
 'wikimedia',
 'algolia',
 'imdb',
 'freecodecamp',
 'steam',
 'crunchyroll',
 'reputation',
 'coursehero',
 'vox',
 'osu',
 'nordvpn',
 'myspace',
 'tellonym',
 'goguardian',
 'quora',
 'lastpass',
 'mewe',
 'chilliapps',
 'abandonmentprotector',
 'ancestry',
 'pythonanywhere',
 'sonic',
 'net',
 'wikia',
 'allrecipes',
 'kongregate',
 'moddb',
 'com

In [661]:
'''uses a method in tfidf to explore the initial clean up step - lowering 
case, removing punctuation, and splitting words'''

tfiddy = TfidfVectorizer()
tfiddy.fit(X_train)
springcleaning = tfiddy.build_analyzer()
before = df['document'][1]
after = springcleaning(df['document'][1])
print(f'Before clean up: \n {before}\n')
print(f'After clean up: \n {after}\n')

Before clean up: 
 The terms of service state: "You must be at least 13 years old to use our Services. The minimum age to use our Services without parental approval may be higher in your home country."

After clean up: 
 ['the', 'terms', 'of', 'service', 'state', 'you', 'must', 'be', 'at', 'least', '13', 'years', 'old', 'to', 'use', 'our', 'services', 'the', 'minimum', 'age', 'to', 'use', 'our', 'services', 'without', 'parental', 'approval', 'may', 'be', 'higher', 'in', 'your', 'home', 'country']



In [749]:
X = df['document']
y = df['label']
multiy = df['point']

In [842]:
X_good = df[df['point'] == 'good']['document']
X_neutral = df[df['point'] == 'neutral']['document']
X_bad = df[df['point'] == 'bad']['document']

In [816]:
X_train, X_test, y_train, y_test = train_test_split(X,y)

In [771]:
MX_train, MX_test, My_train, My_test = train_test_split(X,multiy)

In [827]:
#tfiddy = TfidfVectorizer()
tfiddy = TfidfVectorizer(stop_words=cleanest_companies)
tfiddy.fit(X_train)
X_train_tfiddy = tfiddy.transform(X_train).toarray()
X_test_tfiddy = tfiddy.transform(X_test).toarray()

In [833]:
#countvoncount the countvectorizer
voncount = CountVectorizer(stop_words=cleanest_companies)
voncount.fit(X_train)
X_train_voncount = voncount.transform(X_train).toarray()
X_test_voncount = voncount.transform(X_test).toarray()

In [None]:
#let's use countvoncount for OTHER NEFARIOUS PURPOSES
#for person in main_characters:
#     counter = character_wc_dict[person][0]
#     person_speaks = [wc[0] for wc in main_character_wc if wc[1] == person][0]
#     defining_wc = {}
#     for word, count in counter.iteritems():
#         defining_wc[word] = (count ** 1.3)/person_speaks / (main_characters_counter[word]/any_main_character_speaks)
#     print(person)
#     top_n = nlargest(8, defining_wc.iteritems(), lambda x: x[1])
#     print([str(x[0]) for x in top_n])

# CLUSTERING??? 

In [896]:
voncount = CountVectorizer(stop_words=cleanest_companies)
voncount_all = voncount.fit_transform(df['document']).toarray()
voncount_all.shape

(2560, 5200)

In [901]:
toswords = voncount.get_feature_names()

In [899]:
all_sum = voncount_all.sum()
good_sum = voncount_all[df['point'] == 'good'].sum()
neutral_sum = voncount_all[df['point'] == 'neutral'].sum()
bad_sum = voncount_all[df['point'] == 'bad'].sum()
print(f'Count of all words: {all_sum} \nCount of words in good documents: {good_sum}\
\nCount of words in netural documents: {neutral_sum} \nCount of words in bad documents: {bad_sum}')

Count of all words: 100655 
Count of words in good documents: 24422
Count of words in netural documents: 25768 
Count of words in bad documents: 48987


In [900]:
all_count = np.sum(voncount_all,axis=0)
good_count = np.sum(voncount_all[df['point'] =='good'],axis = 0)
neutral_count = np.sum(voncount_all[df['point'] =='neutral'],axis = 0)
bad_count = np.sum(voncount_all[df['point'] =='bad'],axis = 0)

In [902]:
#atruehonor to meet such a distinguished guest
atruehonor = {}

for class_word_count,total_word_count,word in zip(good_count,all_count,toswords):
    atruehonor[word] = ((class_word_count**1.3)/total_word_count) *()
    

0 1 000
0 1 01
0 3 02
0 1 03
1 1 0303
0 5 04
2 4 05
0 3 06
1 2 07
2 5 08
0 3 09
5 16 10
3 6 100
0 1 1000
0 1 10010
0 1 10013
0 1 101
0 1 10th
1 11 11
1 1 1113
0 3 11th
3 16 12
2 2 120
1 1 123
0 115 13
3 11 14
1 9 15
0 1 156
0 30 16
1 7 17
2 46 18
0 1 18th
0 6 19
2 8 20
0 1 2005
0 1 2009
0 1 2010
0 1 2012
2 5 2013
0 1 20130830
0 4 2014
0 6 2015
2 10 2016
1 20 2017
3 69 2018
0 42 2019
0 1 205369412
1 1 206192659
0 1 206193329
0 1 20th
0 9 21
0 1 22
1 4 23
0 1 23rd
2 9 24
0 2 24th
0 13 25
1 1 256
1 1 256bit
1 3 26
0 2 26th
0 1 27
1 1 2705
0 1 27th
1 7 28
0 3 29
3 3 2fa
35 45 30
0 4 31
1 1 36209614
1 1 39
1 3 3rd
0 2 43
2 2 45
0 1 4557
0 2 468496
0 1 48
1 1 50
0 1 504
0 1 519522125107875
1 1 65
1 1 66
0 1 66123
1 1 69710
1 1 6pz
1 2 90
2 2 95
1 1 972
6 6 99
0 1 9th
1 1 __cfduid
1 1 __ga
1 1 __gid
1 1 __tawkuuid
1 1 __utma
1 1 __utmb
1 1 _hjincludedinsample
1 1 a4
0 2 aaa
2 2 aan
1 1 aanbiedt
0 6 abide
0 1 abilities
7 18 ability
13 47 able
70 256 about
1 4 aboutads
10 33 above
1 1 absolue
0

0 2 deceptive
3 8 decide
3 6 decided
2 5 decides
0 1 deciding
1 4 decision
0 1 declaration
0 1 declaratory
0 1 declared
0 1 declares
1 7 decline
1 1 decoders
1 1 dedicated
1 1 dedication
1 3 deem
1 6 deemed
0 5 deems
0 1 deep
0 10 defamatory
0 1 defame
7 13 default
0 8 defects
1 1 defence
2 54 defend
0 1 defending
0 22 defense
1 6 defenses
0 2 deficiencies
0 2 define
1 15 defined
0 2 defining
1 1 definitions
0 1 defraud
0 1 degrees
1 1 degré
0 1 deidentified
1 1 deine
0 2 delaware
4 9 delay
0 3 delays
0 1 delegate
79 119 delete
1 1 deleteaccount
26 32 deleted
0 1 deletes
6 13 deleting
15 23 deletion
0 2 deletions
1 1 delisting
3 15 deliver
0 4 delivered
0 4 delivering
1 3 delivery
0 17 demand
0 12 demands
1 1 demeurent
1 11 demographic
0 4 demographics
1 1 demonstrate
0 1 den
0 1 denial
1 1 denied
0 1 denmark
0 4 deny
2 3 depend
1 3 dependent
0 1 dependents
3 6 depending
1 2 depends
0 1 depicting
0 1 depictions
0 1 deployment
0 1 der
0 1 derartiges
2 2 derden
2 27 derivative
0 2 deriva

8 39 interests
1 1 interface
1 3 interfaces
0 5 interfere
0 1 interferes
0 1 interfering
1 1 intermédiaire
0 5 internal
0 1 internally
7 19 international
0 1 internationally
10 47 internet
0 1 interoperability
0 2 interpret
0 4 interpretation
0 4 interpreted
0 1 interrupt
0 5 interruption
1 2 interruptions
0 1 interval
0 3 intimidate
1 3 intl
9 36 into
4 6 intuitive
1 1 intégrant
0 6 invalid
0 1 invalidate
0 1 invalidation
0 1 invalidity
1 1 invariant
0 1 invastion
0 1 invented
0 2 inventory
1 8 investigate
0 1 investigating
0 7 investigation
1 5 investigations
0 1 investigatory
0 2 invisibly
1 1 invitation
0 1 invitations
1 1 invite
1 1 invited
0 2 invoice
0 1 involuntary
1 6 involve
5 19 involved
0 1 involvement
0 1 involves
0 6 involving
1 1 inzien
3 6 ios
0 1 iot
17 77 ip
0 3 ireland
0 1 irreparable
1 1 irretrievably
1 1 irreversibly
1 1 irrevocability
2 25 irrevocable
0 8 irrevocably
251 802 is
0 1 isc
3 4 isn
0 4 isp
1 2 israel
2 6 issue
1 3 issued
2 11 issues
2 2 issuing
0 1 ist

1 1 regions
14 36 register
12 20 registered
8 12 registering
1 1 registrar
5 20 registration
1 1 registrations
0 1 registrieren
2 5 regular
2 13 regularly
0 19 regulation
2 17 regulations
1 1 regulator
0 1 regulators
7 13 regulatory
0 1 reimbursement
1 10 reject
0 2 rejecting
0 1 relate
14 83 related
0 1 relates
3 46 relating
2 3 relation
0 1 relations
1 8 relationship
3 20 release
1 2 released
0 3 releases
7 33 relevant
0 12 reliability
0 9 reliable
0 1 reliablity
0 3 reliance
0 9 relief
0 1 relies
0 2 religious
0 2 religiös
0 1 relinquishment
0 1 rely
2 2 relying
7 16 remain
3 9 remaining
3 3 remains
0 1 remarketing
1 2 remarks
0 2 remedies
0 1 remedy
6 8 remember
0 2 remembering
1 1 remembers
0 2 reminders
0 1 remotely
1 5 removal
18 34 removed
0 2 removes
0 1 removing
0 1 render
0 2 rendered
0 4 renew
0 4 renewal
0 1 renewed
2 2 renseignements
10 10 rent
0 1 rented
1 1 reoccurring
0 13 reorganization
0 1 reorganizations
0 2 repair
3 6 repeat
1 1 repeated
3 3 repeatedly
0 1 repeatin

0 1 zuletzt
2 2 zullen
0 1 zum
0 4 zuora
1 1 át
3 3 économie
1 1 öffentliche
0 1 összesitett
0 3 администратор
0 1 администратора
0 2 администратору
0 1 администрации
0 1 адресу
0 1 активности
0 1 безопасностью
0 1 весь
0 1 включая
0 4 вправе
0 1 выявления
0 1 государственным
0 1 данные
0 1 данных
0 1 дата
0 1 действий
0 2 действия
0 2 действующего
0 2 деятельности
0 2 для
0 1 доказано
0 1 допустимых
0 1 достигшее
0 1 достигшим
0 2 за
0 2 законодательства
0 1 законодательство
0 2 законодательством
0 1 запроса
0 1 известно
0 1 изменения
0 4 или
0 1 имени
0 2 интеллектуальной
0 1 информации
0 2 информацию
0 1 иным
0 1 иных
0 1 исключением
0 1 исключительного
0 1 исключительную
0 1 исполнимого
0 2 использование
0 2 использованием
0 1 исследований
0 1 их
0 1 когда
0 1 компрометации
0 1 контактов
0 1 конфиденциальности
0 1 которых
0 1 лаборатория
0 2 лет
0 1 либо
0 1 лиц
0 2 лицам
0 1 лицензию
0 1 лицо
0 1 лицом
0 5 логина
0 1 маркетинговых
0 1 может
0 1 мошеннических
0 5 на
0 1 настоящего


In [814]:
X_test_tfiddy[X_test_tfiddy != 0]

array([0.09650595, 0.1519034 , 0.19719554, ..., 0.04242372, 0.16008265,
       0.02726525])

In [821]:
#first shitty model!

from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB(alpha=.01)
model.fit(X_train_tfiddy, y_train)
y_hat = model.predict(X_test_tfiddy)

thisisdumb = y_test + y_hat 
(thisisdumb == 'badgood').sum()
(thisisdumb == 'goodbad').sum()
model.score(X_test_tfiddy, y_test)

0.8328125

In [834]:
#count vectorizer instead of TFIDF

from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB(alpha=.01)
model.fit(X_train_voncount, y_train)
y_hat = model.predict(X_test_voncount)

model.score(X_test_voncount, y_test)

0.8140625

In [823]:
#add company names as stop words, reduced accuracy is good

model = MultinomialNB(alpha=0.5)
model.fit(X_train_tfiddy, y_train)
y_hat = model.predict(X_test_tfiddy)

thisisdumb = y_test + y_hat 
(thisisdumb == 'badgood').sum()
(thisisdumb == 'goodbad').sum()
model.score(X_test_tfiddy, y_test)

0.825

In [None]:
model = MultinomialNB(alpha=0.5)
model.fit(X_train_tfiddy, y_train)
y_hat = model.predict(X_test_tfiddy)

thisisdumb = y_test + y_hat 
(thisisdumb == 'badgood').sum()
(thisisdumb == 'goodbad').sum()
model.score(X_test_tfiddy, y_test)

In [832]:
#multinomial classifier

tfiddy = TfidfVectorizer(stop_words=cleanest_companies)
tfiddy.fit(MX_train)
MX_train_tfiddy = tfiddy.transform(MX_train).toarray()
MX_test_tfiddy = tfiddy.transform(MX_test).toarray()

model = MultinomialNB(alpha=0.01)
model.fit(MX_train_tfiddy, My_train)
multiy_hat = model.predict(MX_test_tfiddy)

model.score(MX_test_tfiddy, My_test)

0.771875

In [835]:
#multinomial countvectorizer

voncount = CountVectorizer(stop_words=cleanest_companies)
voncount.fit(MX_train)
MX_train_voncount = voncount.transform(MX_train).toarray()
MX_test_voncount = voncount.transform(MX_test).toarray()

model = MultinomialNB(alpha=0.01)
model.fit(MX_train_voncount, My_train)
multiy_hat = model.predict(MX_test_voncount)

model.score(MX_test_voncount, My_test)

0.7640625