In [640]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


In [342]:
# to get JSON files from TOS API
import urllib, json
import pandas as pd
import numpy as np
from pandas.io.json import json_normalize

In [119]:
#create a list of file names to pull each json file
from os import listdir
from os.path import isfile, join
companies = [f for f in listdir('tosdr.org/api/1/service') if isfile(join('tosdr.org/api/1/service', f))]

In [638]:
# nlp imports, chose TFIDF since some words are repeated
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split

# Create Master DF

## Data Cleaning

All of the data for ToS;DR is stored in a directory of different json files, each labeled with a different company name. 

In [244]:
#Pull out the data 
data_list = []

for company in companies:
    with open(f'tosdr.org/api/1/service/{company}') as json_data:
        data_list.append(json.load(json_data))

In [654]:
#Create main dataframe, drop unnecessary columns, expand list of lists

pd_list = []
for data in data_list:
    pd_list.append(pd.DataFrame.from_dict(data['pointsData']).T.loc[:,['id','quoteText','services','title','tosdr']])

alldata_df = pd.concat(pd_list,axis=0).reset_index()
todsr_df = json_normalize(alldata_df['tosdr'])
df = pd.concat((alldata_df,todsr_df),axis=1)
df = df.drop('tosdr',axis=1).explode('services')

In [655]:
#create master document column with tldr as base and quote text as secondar

df['document'] = df['tldr']
df['document'] = np.where(df['document'] == '',df['quoteText'],df['document'])
df['document'] = np.where(df['document'] == 'Generated through the annotate view',df['quoteText'],df['document'])
df['document'].fillna(df['quoteText'])

#if there is no text in the tldr or quoteText column, drop and reset index
df.dropna(axis=0,subset=['document'],inplace=True)
df = df.reset_index(drop=True)

In [672]:
#create labels, the point is really to distinquish "bad" language so labeled 
#both good and neutral as a success

df['label'] = df['point']
df.label = df.label.replace(['bad','neutral','good'],['bad','good','good'])

In [673]:
df

Unnamed: 0,index,id,quoteText,services,title,binding,case,point,privacyRelated,score,tldr,sources,irrelevant,reason,tmp_rating,document,label
0,4411,4411,"Signal does not sell, rent or monetize your pe...",signal,This service does not sell your personal data,True,This service does not sell your personal data,good,True,25,"The terms of service state that ""Signal does n...",,,,,"The terms of service state that ""Signal does n...",good
1,4412,4412,You must be at least 13 years old to use our S...,signal,You must be at least 13 years old to use the s...,True,This service is only available to users of a c...,neutral,,15,"The terms of service state: ""You must be at le...",,,,,"The terms of service state: ""You must be at le...",good
2,4414,4414,You agree to resolve any Claim you have with u...,signal,Court of law is in California,True,The court of law governing the terms is in loc...,neutral,,0,"The terms of service state: ""You agree to reso...",,,,,"The terms of service state: ""You agree to reso...",good
3,4415,4415,"We may modify, suspend, or terminate your acce...",signal,The service can delete your account without pr...,True,The service can delete your account without pr...,bad,,60,"The terms of service state: ""We may modify, su...",,,,,"The terms of service state: ""We may modify, su...",bad
4,4416,4416,We work with third parties to provide some of ...,signal,Third parties may be involved in operating the...,True,Third parties may be involved in operating the...,bad,,15,"The privacy policy states: ""We work with third...",,,,,"The privacy policy states: ""We work with third...",bad
...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...,...
2555,4523,4523,,tos;dr,The terms for ToS;DR are easy to read,True,The terms for this service are easy to read,good,,15,"Altogether, ToS and PP only make up eight line...",,,,,"Altogether, ToS and PP only make up eight line...",good
2556,4524,4524,Nothing here should be considered legal advice...,tos;dr,ToS;DR gives no guarantee regarding quality,True,This services gives no guarantee regarding qua...,bad,,20,"The ToS state: ""Nothing here should be conside...",,,,,"The ToS state: ""Nothing here should be conside...",bad
2557,7679,7679,We do not use cookies or any other tracking te...,tos;dr,This service does not track you,True,This service does not track you,good,,100,Generated through the annotate view,,,,,We do not use cookies or any other tracking te...,good
2558,7683,7683,By contributing to this project (e.g.\nby send...,tos;dr,"If you offer suggestions to the service, they ...",True,"If you offer suggestions to the service, they ...",good,,50,Generated through the annotate view,,,,,By contributing to this project (e.g.\nby send...,good


In [541]:
#random lookup to check data

df.loc[np.random.randint(1,2564,5),['quoteText','tldr','document']]

Unnamed: 0,quoteText,tldr,document
1482,Cookies are small data files that are commonly...,This is probably more of a functional issue th...,This is probably more of a functional issue th...
582,"As we continue to develop our business, we mig...",Amazon may sell user data as part of a busines...,Amazon may sell user data as part of a busines...
2174,"Don’t use any kind of software, device or meth...",Generated through the annotate view,"Don’t use any kind of software, device or meth..."
735,We use Google Analytics which is a web analyti...,Generated through the annotate view,We use Google Analytics which is a web analyti...
662,In some circumstances and to the extent permit...,Generated through the annotate view,In some circumstances and to the extent permit...


In [575]:
#the land of sad documents
df.document[0],df.document[722]

('The terms of service state that "Signal does not sell, rent or monetize your personal data or content in any way – ever."',
 'Last updated: June 07, 2019')

# First Shitty Model 
### Start taking the parts of the dataframe that I want to keep

In [674]:
X = df['document']
y = df['label']

In [675]:
X_train, X_test, y_train, y_test = train_test_split(X,y)

In [661]:
'''uses a method in tfidf to explore the initial clean up step - lowering 
case, removing punctuation, and splitting words'''

tfiddy = TfidfVectorizer()
tfiddy.fit(X_train)
springcleaning = tfiddy.build_analyzer()
before = df['document'][1]
after = springcleaning(df['document'][1])
print(f'Before clean up: \n {before}\n')
print(f'After clean up: \n {after}\n')

Before clean up: 
 The terms of service state: "You must be at least 13 years old to use our Services. The minimum age to use our Services without parental approval may be higher in your home country."

After clean up: 
 ['the', 'terms', 'of', 'service', 'state', 'you', 'must', 'be', 'at', 'least', '13', 'years', 'old', 'to', 'use', 'our', 'services', 'the', 'minimum', 'age', 'to', 'use', 'our', 'services', 'without', 'parental', 'approval', 'may', 'be', 'higher', 'in', 'your', 'home', 'country']



In [734]:
'''create a list of clean companies that can be used as stop words - use 
springcleaning function to format them in a similar way to the rest of 
tfiddy
'''
dirty_companies = [company.split('.') for company in companies]
clean_companies = [springcleaning(company[0]) for company in dirty_companies]
cleanest_companies = [company for sublist in clean_companies for company in sublist]

In [736]:
cleanest_companies

['signal',
 'musik',
 'sammler',
 'whatismyip',
 'com',
 'email',
 'cz',
 'upcloud',
 'customink',
 'brilliant',
 'pure',
 'zenimaxmediainc',
 'qwant',
 'virtbiz',
 'any',
 'do',
 'visible',
 'sprint',
 'stackoverflow',
 'symantec',
 'gitlab',
 'airbnb',
 'websaver',
 'mcdonald',
 'kitsu',
 'meetup',
 'weebly',
 'pexgle',
 'apple',
 'web',
 'de',
 'dr',
 'mcdougall',
 'shealth',
 'medicalcenter',
 'newegg',
 'com',
 'yelp',
 'carfax',
 'jetbrains',
 'diytubevideocommunity',
 'windowslogicproductions',
 'virgin',
 'discogs',
 'digitaladvertisingplatform',
 'reklamstore',
 'cnn',
 'forbes',
 'npm',
 'w3schools',
 'etesync',
 'librarything',
 'wikimedia',
 'algolia',
 'imdb',
 'freecodecamp',
 'steam',
 'crunchyroll',
 'reputation',
 'coursehero',
 'vox',
 'osu',
 'nordvpn',
 'myspace',
 'tellonym',
 'goguardian',
 'quora',
 'lastpass',
 'mewe',
 'chilliapps',
 'abandonmentprotector',
 'ancestry',
 'pythonanywhere',
 'sonic',
 'net',
 'wikia',
 'allrecipes',
 'kongregate',
 'moddb',
 'com

In [737]:
tfiddy = TfidfVectorizer(stop_words=cleanest_companies)
tfiddy.fit(X_train)

TfidfVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.float64'>, encoding='utf-8',
                input='content', lowercase=True, max_df=1.0, max_features=None,
                min_df=1, ngram_range=(1, 1), norm='l2', preprocessor=None,
                smooth_idf=True,
                stop_words=['signal', 'musik', 'sammler', 'whatismyip', 'com',
                            'email', 'cz', 'upcloud', 'customink', 'brilliant',
                            'pure', 'zenimaxmediainc', 'qwant', 'virtbiz',
                            'any', 'do', 'visible', 'sprint', 'stackoverflow',
                            'symantec', 'gitlab', 'airbnb', 'websaver',
                            'mcdonald', 'kitsu', 'meetup', 'weebly', 'pexgle',
                            'apple', 'web', ...],
                strip_accents=None, sublinear_tf=False,
                token_pattern='(?u)\\b\\w\\w+\\b', tokenizer=None, use_idf=True,
                vocabular

In [738]:
X_train_tfiddy = tfiddy.transform(X_train).toarray()
X_test_tfiddy = tfiddy.transform(X_test).toarray()

In [703]:
#first shitty model!

from sklearn.naive_bayes import MultinomialNB
model = MultinomialNB()
model.fit(X_train_tfiddy, y_train)
y_hat = model.predict(X_test_tfiddy)

thisisdumb = y_test + y_hat 
(thisisdumb == 'badgood').sum()
(thisisdumb == 'goodbad').sum()
model.score(X_test_tfiddy, y_test)

0.825

In [740]:
model = MultinomialNB()
model.fit(X_train_tfiddy, y_train)
y_hat = model.predict(X_test_tfiddy)

thisisdumb = y_test + y_hat 
(thisisdumb == 'badgood').sum()
(thisisdumb == 'goodbad').sum()
model.score(X_test_tfiddy, y_test)

0.821875