In [39]:
%load_ext autoreload
%autoreload 2

The autoreload extension is already loaded. To reload it, use:
  %reload_ext autoreload


# Imports

In [40]:
# the usual imports
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from collections import Counter
# nlp imports
from nltk.corpus import stopwords
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
# model imports
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import GaussianNB,MultinomialNB,ComplementNB
from sklearn.linear_model import LogisticRegression,LogisticRegressionCV
from sklearn.ensemble import RandomForestClassifier, GradientBoostingClassifier
# metric imports
from sklearn.metrics import roc_curve, auc
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import label_binarize

In [395]:
from apps.tosclassifier import *

# Create Master DF

## Data Cleaning

All of the data for ToS;DR is stored in a directory of different json files, each labeled with a different company name. 

In [406]:
cleaner = ToS_DataCleaner('../tosdr.org/api/1/service')

In [408]:
companies = pd.Series()

In [414]:
companies.to_csv('apps/companies_df.csv')

AttributeError: 'list' object has no attribute 'to_csv'

In [409]:
df = cleaner.create_df()

In [413]:
df.to_csv('apps/tos_df.csv')

In [410]:
tos = ToS_Classifier(df)

In [412]:
X,y = tos.get_data()

In [389]:
X = tos.vectorize_text(X,companies,new_X=None,y=None,cross_validate=False)

In [381]:
tos.ensemble_fit(X,y)

KeyboardInterrupt: 

In [390]:
import pickle
with open('apps/tos_classifier.pkl', 'wb') as f:
    pickle.dump(tos, f)

In [393]:
model = pickle.load(open('apps/tos_classifier.pkl','rb'))

In [363]:
X,y = model.get_data()

In [364]:
proba = model.predict_proba(X,input_user=True,input_X=terms_EU)

In [365]:
colors = model.get_colors(proba[:,1])

In [366]:
colors

['#FAA181',
 '#F2DE97',
 '#FAA181',
 '#F2DE97',
 '#DFEEB9',
 '#F77B7E',
 '#F2DE97',
 '#F2DE97',
 '#C7FEDD',
 '#F2DE97',
 '#F77B7E',
 '#F2DE97',
 '#FAA181',
 '#DFEEB9',
 '#F2DE97',
 '#DFEEB9',
 '#DFEEB9',
 '#FAA181',
 '#F77B7E',
 '#FAA181',
 '#F77B7E']

In [289]:
twitch_tos = '''Twitch allows users to distribute streaming live and pre-recorded audio-visual works, to use services, such as chat, bulletin boards, forum postings, wiki contributions, voice interactive services, and to participate in other activities in which you may create, post, transmit, perform, or store content, messages, text, sound, images, applications, code or other data or materials on the Twitch Services (“User Content”).

(i) Unless otherwise agreed to in a written agreement between you and Twitch that was signed by an authorized representative of Twitch, if you submit, transmit, display, perform, post or store User Content using the Twitch Services, you grant Twitch and its sublicensees, to the furthest extent and for the maximum duration permitted by applicable law (including in perpetuity if permitted under applicable law), an unrestricted, worldwide, irrevocable, fully sub-licenseable, nonexclusive, and royalty-free right to (a) use, reproduce, modify, adapt, publish, translate, create derivative works from, distribute, perform and display such User Content (including without limitation for promoting and redistributing part or all of the Twitch Services (and derivative works thereof)) in any form, format, media or media channels now known or later developed or discovered; and (b) use the name, identity, likeness and voice (or other biographical information) that you submit in connection with such User Content. Should such User Content contain the name, identity, likeness and voice (or other biographical information) of third parties, you represent and warrant that you have obtained the appropriate consents and/or licenses for your use of such features and that Twitch and its sub-licensees are allowed to use them to the extent indicated in these Terms of Service.

(ii) With respect to User Content known as “add-ons”, “maps”, “mods”, or other types of projects submitted through CurseForge.com or related sites (“Submitted Projects”), the rights granted by you hereunder terminate once you remove or delete such Submitted Projects from the Twitch Services. You also acknowledge that Twitch may retain, but not display, distribute, or perform, server copies of Submitted Projects that have been removed or deleted.

(iii) With respect to streaming live and pre-recorded audio-visual works, the rights granted by you hereunder terminate once you delete such User Content from the Twitch Services, or generally by closing your account, except (a) to the extent you shared it with others as part of the Twitch Services and others copied or stored portions of the User Content (e.g., made a Clip); (b) Twitch used it for promotional purposes; and (c) for the reasonable time it takes to remove from backup and other systems.

You are solely responsible for your User Content and the consequences of posting or publishing it. You represent, and warrant that: (1) you are the creator and owner of the User Content or otherwise have sufficient rights and authority to grant the rights granted herein; (2) your User Content does not and will not (a) infringe, violate, or misappropriate any third-party right, including any copyright, trademark, patent, trade secret, moral right, privacy right, right of publicity, or any other intellectual property or proprietary right or (b) defame any other person; (3) your User Content does not contain any viruses, adware, spyware, worms, or other harmful or malicious code; and (4) unless you have received prior written authorization, your User Content specifically does not contain any pre-release or non-public beta software or game content or any confidential information of Twitch or third parties. Twitch reserves all rights and remedies against any users who breach these representations and warranties.

Twitch uses reasonable security measures in order to attempt to protect User Content against unauthorized copying and distribution. However, Twitch does not guarantee that any unauthorized copying, use or distribution of User Content by third parties will not take place. To the furthest extent permitted by applicable law, you hereby agree that Twitch shall not be liable for any unauthorized copying, use or distribution of User Content by third parties and release and forever waive any claims you may have against Twitch for any such unauthorized copying or usage of the User Content, under any theory. THE SECURITY MEASURES TO PROTECT USER CONTENT USED BY TWITCH HEREIN ARE PROVIDED AND USED “AS-IS” AND WITH NO WARRANTIES, GUARANTEES, CONDITIONS, ASSURANCES OR OTHER TERMS THAT SUCH SECURITY MEASURES WILL WITHSTAND ATTEMPTS TO EVADE SECURITY MECHANISMS OR THAT THERE WILL BE NO CRACKS, DISABLEMENTS OR OTHER CIRCUMVENTION OF SUCH SECURITY MEASURES.

Users may promote, administer, or conduct a promotion (a contest or sweepstakes) on, through or utilizing the Twitch Services (a “ Promotion ”). If you choose to promote, administer or conduct a Promotion, you must adhere to the following rules: (i) You may carry out Promotions to the extent permitted by applicable law and you are solely responsible for ensuring that any Promotions comply with any and all applicable laws, obligations, and restrictions. (ii) You will be classified as the promoter of your Promotion in the applicable jurisdiction(s) and you will be solely responsible for all aspects of and expenses related to your Promotion, including, without limitation, the execution, administration, and operation of the Promotion; drafting and posting any official rules; selecting winners; issuing prizes; and obtaining all necessary third-party permissions and approvals, including, without limitation, filing any and all necessary registrations and bonds. Twitch has the right to remove your Promotion from the Twitch Services if Twitch reasonably believes that your Promotion does not comply with the Terms of Service or applicable law. (iii) Twitch is not responsible for and does not endorse or support any such Promotions. You may not indicate that Twitch is a sponsor or co-sponsor of the Promotion. (iv) You will display or read out the following when a Promotion is connection with your Promotion: “This is a promotion by [Your Name]. Twitch does not sponsor or endorse [Your Name]’s promotion and is not responsible for this promotion”.

You agree that your User Content will comply with the FTC’s Guidelines Concerning the Use of Testimonials and Endorsements in Advertising, the FTC’s .com Disclosures Guide , the FTC’s Native Advertising Guidelines, and any other guidelines issued by the FTC from time to time (“FTC Guidelines”), as well as any other advertising guidelines required under applicable law. For example, if you have been paid or provided with free products in exchange for discussing or promoting a product or service through the Twitch Services, or if you are an employee of a company and you decide to discuss or promote that company’s products or services through the Twitch Services, you agree to comply with the FTC Guidelines’ requirements for disclosing such relationships. You, and not Twitch, are solely responsible for any endorsements or testimonials you make regarding any product or service through the Twitch Services.

Subject to these Terms of Service and the Community Guidelines, you may share political opinions; participate in political activity; provide links to a political committee’s official website, including the contribution page of a political committee; and solicit viewers to make contributions directly to a political committee. You agree, however, that these activities are entirely your own. Moreover, by engaging in these activities, you represent and warrant that you are eligible to engage in them under applicable law, and that you will abide by all relevant laws and regulations while doing so. 

You agree not to solicit the use of or use any Twitch monetization tool (e.g., Bits or subscriptions) for the purpose of making or delivering a contribution to a candidate, candidate’s committee, political action committee, ballot committee, or any other campaign committee, or otherwise for the purpose of influencing any election. Candidates for political office are not eligible to use any Twitch monetization tool on their channels.'''

In [285]:
tiktok_tos = '''Users of the Services may be permitted to upload, post or transmit (such as via a stream) or otherwise make available content through the Services including, without limitation, any text, photographs, user videos, sound recordings and the musical works embodied therein, including videos that incorporate locally stored sound recordings from your personal music library and ambient noise (“User Content”). Users of the Services may also extract all or any portion of User Content created by another user to produce additional User Content, including collaborative User Content with other users, that combine and intersperse User Content generated by more than one user. Users of the Services may also overlay music, graphics, stickers, Virtual Items (as defined and further explained Virtual Items Policy) and other elements provided by TikTok (“TikTok Elements”) onto this User Content and transmit this User Content through the Services. The information and materials in the User Content, including User Content that includes TikTok Elements, have not been verified or approved by us. The views expressed by other users on the Services (including through use of the virtual gifts) do not represent our views or values.

Whenever you access or use a feature that allows you to upload or transmit User Content through the Services (including via certain third party social media platforms such as Instagram, Facebook, YouTube, Twitter), or to make contact with other users of the Services, you must comply with the standards set out at “Your Access to and Use of Our Services” above. You may also choose to upload or transmit your User Content, including User Content that includes TikTok Elements, on sites or platforms hosted by third parties. If you decide to do this, you must comply with their content guidelines as well as with the standards set out at “Your Access to and Use of Our Services” above. As noted above, these features may not be available to all users of the Services, and we have no liability to you for limiting your right to certain features of the Services.

You warrant that any such contribution does comply with those standards, and you will be liable to us and indemnify us for any breach of that warranty. This means you will be responsible for any loss or damage we suffer as a result of your breach of warranty.

Any User Content will be considered non-confidential and non-proprietary. You must not post any User Content on or through the Services or transmit to us any User Content that you consider to be confidential or proprietary. When you submit User Content through the Services, you agree and represent that you own that User Content, or you have received all necessary permissions, clearances from, or are authorised by, the owner of any part of the content to submit it to the Services, to transmit it from the Services to other third party platforms, and/or adopt any third party content.

If you only own the rights in and to a sound recording, but not to the underlying musical works embodied in such sound recordings, then you must not post such sound recordings to the Services unless you have all permissions, clearances from, or are authorised by, the owner of any part of the content to submit it to the Services

You or the owner of your User Content still own the copyright in User Content sent to us, but by submitting User Content via the Services, you hereby grant us an unconditional irrevocable, non-exclusive, royalty-free, fully transferable, perpetual worldwide licence to use, modify, adapt, reproduce, make derivative works of, publish and/or transmit, and/or distribute and to authorise other users of the Services and other third-parties to view, access, use, download, modify, adapt, reproduce, make derivative works of, publish and/or transmit your User Content in any format and on any platform, either now known or hereinafter invented.

You further grant us a royalty-free license to use your user name, image, voice, and likeness to identify you as the source of any of your User Content; provided, however, that your ability to provide an image, voice, and likeness may be subject to limitations due to age restrictions.

For the avoidance of doubt, the rights granted in the preceding paragraphs of this Section include, but are not limited to, the right to reproduce sound recordings (and make mechanical reproductions of the musical works embodied in such sound recordings), and publicly perform and communicate to the public sound recordings (and the musical works embodied therein), all on a royalty-free basis. This means that you are granting us the right to use your User Content without the obligation to pay royalties to any third party, including, but not limited to, a sound recording copyright owner (e.g., a record label), a musical work copyright owner (e.g., a music publisher), a performing rights organization (e.g., ASCAP, BMI, SESAC, etc.) (a “PRO”), a sound recording PRO (e.g., SoundExchange), any unions or guilds, and engineers, producers or other royalty participants involved in the creation of User Content.

Specific Rules for Musical Works and for Recording Artists. If you are a composer or author of a musical work and are affiliated with a PRO, then you must notify your PRO of the royalty-free license you grant through these Terms in your User Content to us. You are solely responsible for ensuring your compliance with the relevant PRO’s reporting obligations. If you have assigned your rights to a music publisher, then you must obtain the consent of such music publisher to grant the royalty-free license(s) set forth in these Terms in your User Content or have such music publisher enter into these Terms with us. Just because you authored a musical work (e.g., wrote a song) does not mean you have the right to grant us the licenses in these Terms. If you are a recording artist under contract with a record label, then you are solely responsible for ensuring that your use of the Services is in compliance with any contractual obligations you may have to your record label, including if you create any new recordings through the Services that may be claimed by your label.

Through-To-The-Audience Rights. All of the rights you grant in your User Content in these Terms are provided on a through-to-the-audience basis, meaning the owners or operators of third party services will not have any separate liability to you or any other third party for User Content posted or used on such third party service via the Services.

Waiver of Rights to User Content. By posting User Content to or through the Services, you waive any rights to prior inspection or approval of any marketing or promotional materials related to such User Content. You also waive any and all rights of privacy, publicity, or any other rights of a similar nature in connection with your User Content, or any portion thereof. To the extent any moral rights are not transferable or assignable, you hereby waive and agree never to assert any and all moral rights, or to support, maintain or permit any action based on any moral rights that you may have in or with respect to any User Content you Post to or through the Services.

We also have the right to disclose your identity to any third party who is claiming that any User Content posted or uploaded by you to our Services constitutes a violation of their intellectual property rights, or of their right to privacy.

We, or authorised third parties, reserve the right to cut, crop, edit or refuse to publish, your content at our or their sole discretion. We have the right to remove, disallow, block or delete any posting you make on our Services if, in our opinion, your post does not comply with the content standards set out at “Your Access to and Use of Our Services”above. In addition, we have the right – but not the obligation – in our sole discretion to remove, disallow, block or delete any User Content (i) that we consider to violate these Terms, or (ii) in response to complaints from other users or third parties, with or without notice and without any liability to you. As a result, we recommend that you save copies of any User Content that you post to the Services on your personal device(s) in the event that you want to ensure that you have permanent access to copies of such User Content. We do not guarantee the accuracy, integrity, appropriateness or quality of any User Content, and under no circumstances will we be liable in any way for any User Content.

You control whether your User Content is made publicly available on the Services to all other users of the Services or only available to people you approve. To restrict access to your User Content, you should select the privacy setting available within the Platform.

We accept no liability in respect of any content submitted by users and published by us or by authorised third parties.

If you wish to complain about information and materials uploaded by other users please contact us at: feedback@tiktok.com.

TikTok takes reasonable measures to expeditiously remove from our Services any infringing material that we become aware of.It is TikTok’s policy, in appropriate circumstances and at its discretion, to disable or terminate the accounts of users of the Services who repeatedly infringe copyrights or intellectual property rights of others.

While our own staff is continually working to develop and evaluate our own product ideas and features, we pride ourselves on paying close attention to the interests, feedback, comments, and suggestions we receive from the user community. If you choose to contribute by sending us or our employees any ideas for products, services, features, modifications, enhancements, content, refinements, technologies, content offerings (such as audio, visual, games, or other types of content), promotions, strategies, or product/feature names, or any related documentation, artwork, computer code, diagrams, or other materials (collectively “Feedback”), then regardless of what your accompanying communication may say, the following terms will apply, so that future misunderstandings can be avoided. Accordingly, by sending Feedback to us, you agree that:

TikTok has no obligation to review, consider, or implement your Feedback, or to return to you all or part of any Feedback for any reason;

Feedback is provided on a non-confidential basis, and we are not under any obligation to keep any Feedback you send confidential or to refrain from using or disclosing it in any way; and

You irrevocably grant us perpetual and unlimited permission to reproduce, distribute, create derivative works of, modify, publicly perform (including on a through-to-the-audience basis), communicate to the public, make available, publicly display, and otherwise use and exploit the Feedback and derivatives thereof for any purpose and without restriction, free of charge and without attribution of any kind, including by making, using, selling, offering for sale, importing, and promoting commercial products and services that incorporate or embody Feedback, whether in whole or in part, and whether as provided or as modified.'''

In [279]:
youtube_tos = '''If you have a YouTube channel, you may be able to upload Content to the Service. You may use your Content to promote your business or artistic enterprise. If you choose to upload Content, you must not submit to the Service any Content that does not comply with this Agreement (including the YouTube Community Guidelines) or the law. For example, the Content you submit must not include third-party intellectual property (such as copyrighted material) unless you have permission from that party or are otherwise legally entitled to do so. You are legally responsible for the Content you submit to the Service. We may use automated systems that analyze your Content to help detect infringement and abuse, such as spam, malware, and illegal content.

You retain ownership rights in your Content. However, we do require you to grant certain rights to YouTube and other users of the Service, as described below.

By providing Content to the Service, you grant to YouTube a worldwide, non-exclusive, royalty-free, sublicensable and transferable license to use that Content (including to reproduce, distribute, prepare derivative works, display and perform it) in connection with the Service and YouTube’s (and its successors' and Affiliates') business, including for the purpose of promoting and redistributing part or all of the Service.


You also grant each other user of the Service a worldwide, non-exclusive, royalty-free license to access your Content through the Service, and to use that Content, including to reproduce, distribute, prepare derivative works, display, and perform it, only as enabled by a feature of the Service (such as video playback or embeds). For clarity, this license does not grant any rights or permissions for a user to make use of your Content independent of the Service.

The licenses granted by you continue for a commercially reasonable period of time after you remove or delete your Content from the Service. You understand and agree, however, that YouTube may retain, but not display, distribute, or perform, server copies of your videos that have been removed or deleted. 


You may remove your Content from the Service at any time. You also have the option to make a copy of your Content before removing it. You must remove your Content if you no longer have the rights required by these terms.


If we reasonably believe that any Content is in breach of this Agreement or may cause harm to YouTube, our users, or third parties, we may remove or take down that Content in our discretion. We will notify you with the reason for our action unless we reasonably believe that to do so: (a) would breach the law or the direction of a legal enforcement authority or would otherwise risk legal liability for YouTube or our Affiliates; (b) would compromise an investigation or the integrity or operation of the Service; or (c) would cause harm to any user, other third party, YouTube or our Affiliates. You can learn more about reporting and enforcement, including how to appeal on the Troubleshooting page of our Help Center.


We provide information to help copyright holders manage their intellectual property online in our YouTube Copyright Center. If you believe your copyright has been infringed on the Service, please send us a notice.

We respond to notices of alleged copyright infringement according to the process in our YouTube Copyright Center, where you can also find information about how to resolve a copyright strike. YouTube's policies provide for the termination, in appropriate circumstances, of repeat infringers’ access to the Service.'''

In [300]:
soundcloud_tos = '''Your content

Any and all audio, text, photos, pictures, graphics, comments, and other content, data or information that you upload, store, transmit, submit, exchange or make available to or via the Platform (hereinafter "Your Content") is generated, owned and controlled solely by you, and not by SoundCloud.

SoundCloud does not claim any ownership rights in Your Content, and you hereby expressly acknowledge and agree that Your Content remains your sole responsibility.

Without prejudice to the conditions set forth in Your Use of the Platform you must not upload, store, distribute, send, transmit, display, perform, make available, continue to make available or otherwise communicate to the public any Content to which you do not hold the necessary rights. In particular, any unauthorized use of copyright protected material within Your Content (including by way of reproduction, distribution, modification, adaptation, public display, public performance, preparation of derivative works, making available or otherwise communicating to the public via the Platform), independent of wheteher it is or becomes unauthorized at a later point, may constitute an infringement of third party rights and is strictly prohibited. Any such infringements may result in termination of your access to the Platform as described in the Repeat Infringers section below, and may also result in civil litigation or criminal prosecution by or on behalf of the relevant rightsholder.

We may, from time to time, invite or provide you with means to provide feedback regarding the Platform, and in such circumstances, any feedback you provide will be deemed non-confidential and SoundCloud shall have the right, but not the obligation, to use such feedback on an unrestricted basis.

By uploading or posting Your Content to the Platform, you initiate an automated process to transcode any audio Content and direct SoundCloud to store Your Content on our servers, from where you may control and authorize the use, ways of reproduction, transmission, distribution, public display, public performance, making available (including whether users will be permitted to listen to your Content offline) and other communication to the public of Your Content on the Platform and elsewhere using the Services. To the extent it is necessary in order for SoundCloud to provide you with any of the aforementioned hosting services, to undertake any of the tasks set forth in these Terms of Use, including the distribution of advertising or other promotional material on our Platform and/or to enable your use of the Platform, you hereby grant such licenses to SoundCloud on a limited, worldwide, non-exclusive, royalty-free and fully paid basis.

By uploading Your Content to the Platform, you also grant a limited, worldwide, non-exclusive, royalty-free, fully paid up, license to other users of the Platform, and to operators and users of any other websites, apps and/or platforms to which Your Content has been shared or embedded using the Services ("Linked Services"), to use, copy, listen to offline, repost, transmit or otherwise distribute, publicly display, publicly perform, adapt, prepare derivative works of, compile, make available and otherwise communicate to the public, Your Content utilizing the features of the Platform from time to time, and within the parameters set by you using the Services. You can limit and restrict the availability of certain of Your Content to other users of the Platform, and to users of Linked Services, at any time using the permissions tab in the track edit section for each sound you upload, subject to the provisions of the Disclaimer section below. Notwithstanding the foregoing, nothing in these Terms of Use grants any rights to any other user of the Platform with respect to any proprietary name, logo, trademark or service mark uploaded by you as part of Your Content (for example, your profile picture) ("Marks"), other than the right to reproduce, publicly display, make available and otherwise communicate to the public those Marks, automatically and without alteration, as part of the act of reposting sounds with which you have associated those Marks.

The licenses granted in this section are granted separately with respect to each item of Your Content that you upload to the Platform. Licenses with respect to audio Content, and any images or text within your account, will (subject to the following paragraph of these Terms of Use) terminate automatically when you remove such Content from your account. Licenses with respect to comments or other contributions that you make on the Platform will be perpetual and irrevocable, and will continue notwithstanding any termination of your account.

Removal of audio Content from your account will automatically result in the deletion of the relevant files from SoundCloud’s systems and servers. However, notwithstanding the foregoing, you hereby acknowledge and agree that once Your Content is distributed to a Linked Service, SoundCloud is not obligated to ensure the deletion of Your Content from any servers or systems operated by the operators of any Linked Service, or to require that any user of the Platform or any Linked Service deletes any item of Your Content. Furthermore, if you authorize any of Your Content to be available for offline listening, after deletion of an item of Your Content or removal from the ability for other users to listen to the applicable Content offline, the applicable Content may still be temporarily available to other users of the Platform who saved the applicable Content for offline listening on their devices, but no longer than 30 days from the time of deletion.

Any Content other than Your Content is the property of the relevant Uploader, and is or may be subject to copyright, trademark rights or other intellectual property or proprietary rights. Such Content may not be downloaded, reproduced, distributed, transmitted, re-uploaded, republished, displayed, sold, licensed, made available or otherwise communicated to the public or exploited for any purposes except via the features of the Platform from time to time and within the parameters set by the Uploader on the Platform or with the express written consent of the Uploader. Where you repost another user’s Content, or include another user’s Content in a playlist or station or where you listen to another user’s Content offline, you acquire no ownership rights whatsoever in that Content. Subject to the rights expressly granted in this section, all rights in Content are reserved to the relevant Uploader.'''

In [284]:
reddit_tos = '''

The Services may contain information, text, links, graphics, photos, videos, or other materials (“Content”), including Content created with or submitted to the Services by you or through your Account (“Your Content”). We take no responsibility for and we do not expressly or implicitly endorse any of Your Content.

By submitting Your Content to the Services, you represent and warrant that you have all rights, power, and authority necessary to grant the rights to Your Content contained within these Terms. Because you alone are responsible for Your Content, you may expose yourself to liability if you post or share Content without all necessary rights.

You retain any ownership rights you have in Your Content, but you grant Reddit the following license to use that Content:

When Your Content is created with or submitted to the Services, you grant us a worldwide, royalty-free, perpetual, irrevocable, non-exclusive, transferable, and sublicensable license to use, copy, modify, adapt, prepare derivative works from, distribute, perform, and display Your Content and any name, username, voice, or likeness provided in connection with Your Content in all media formats and channels now known or later developed. This license includes the right for us to make Your Content available for syndication, broadcast, distribution, or publication by other companies, organizations, or individuals who partner with Reddit. You also agree that we may remove metadata associated with Your Content, and you irrevocably waive any claims and assertions of moral rights or attribution with respect to Your Content.

Any ideas, suggestions, and feedback about Reddit or our Services that you provide to us are entirely voluntary, and you agree that Reddit may use such ideas, suggestions, and feedback without compensation or obligation to you.

Although we have no obligation to screen, edit, or monitor Your Content, we may, in our sole discretion, delete or remove Your Content at any time and for any reason, including for a violation of these Terms, a violation of our Content Policy, or if you otherwise create liability for us.
'''

# Feature Engineering

In [73]:
X,y

(0       Signal does not sell, rent or monetize your pe...
 1       You must be at least 13 years old to use our S...
 2       You agree to resolve any Claim you have with u...
 3       We may modify, suspend, or terminate your acce...
 4       We work with third parties to provide some of ...
                               ...                        
 2547    Phoenix is hosted on Heroku for development pu...
 2549    Nothing here should be considered legal advice...
 2550    We do not use cookies or any other tracking te...
 2551    By contributing to this project (e.g.\nby send...
 2552    We need you to license your contributions unde...
 Name: quoteText, Length: 2440, dtype: object,
 0       0
 1       0
 2       0
 3       1
 4       1
        ..
 2547    1
 2549    1
 2550    0
 2551    0
 2552    0
 Name: label, Length: 2440, dtype: int64)

In [74]:
X_train, X_test, y_train, y_test = train_test_split(X,y,test_size=0.25)

In [None]:
def countvoncount(stop_words,X_train,y_)
voncount = CountVectorizer(stop_words=companies,ngram_range=(1,2))
voncount.fit(X_train)
X_train_voncount = voncount.transform(X_train).toarray()
X_test_voncount = voncount.transform(X_test).toarray()

In [None]:
voncount = CountVectorizer(stop_words=companies)
voncount_all = voncount.fit_transform(df['document']).toarray()
toswords = voncount.get_feature_names()

In [None]:
all_sum = voncount_all.sum()
good_sum = voncount_all[df['point'] == 'good'].sum()
neutral_sum = voncount_all[df['point'] == 'neutral'].sum()
bad_sum = voncount_all[df['point'] == 'bad'].sum()
print(f'Count of all words: {all_sum} \nCount of words in good documents: {good_sum}\
\nCount of words in netural documents: {neutral_sum} \nCount of words in bad documents: {bad_sum}')

In [None]:
all_count = np.sum(voncount_all,axis=0)
good_count = np.sum(voncount_all[df['point'] =='good'],axis = 0)
neutral_count = np.sum(voncount_all[df['point'] =='neutral'],axis = 0)
bad_count = np.sum(voncount_all[df['point'] =='bad'],axis = 0)

In [None]:
# Distinguishing words

good_words = {}
neutral_words = {}
bad_words = {}
alph = 1.2

for class_word_count,total_word_count,word in zip(good_count,all_count,toswords):
    good_words[word] = ((class_word_count**alph)/total_word_count).round(2)
for class_word_count,total_word_count,word in zip(neutral_count,all_count,toswords):
    neutral_words[word] = ((class_word_count**alph)/total_word_count).round(2)
for class_word_count,total_word_count,word in zip(bad_count,all_count,toswords):
    bad_words[word] = ((class_word_count**alph)/total_word_count).round(2)
    
good_15 = pd.DataFrame(Counter(good_words).most_common(15))
neutral_15 = pd.DataFrame(Counter(neutral_words).most_common(15))
bad_15 = pd.DataFrame(Counter(bad_words).most_common(15))

unique_words = pd.concat([good_15,neutral_15,bad_15],axis=1)
unique_words

In [77]:
tfiddy = TfidfVectorizer(stop_words=companies,ngram_range=(1,2))
tfiddy.fit(X_train)
X_train_tfiddy = tfiddy.transform(X_train).toarray()
X_test_tfiddy = tfiddy.transform(X_test).toarray()

  'stop_words.' % sorted(inconsistent))


In [186]:
m_naive = MultinomialNB(alpha=.1)
m_naive.fit(X_train_tfiddy, y_train)
y_hat = m_naive.predict_proba(X_test_tfiddy)
y_hat
y_hat

array([[9.98815473e-01, 1.18452711e-03],
       [3.14736388e-03, 9.96852636e-01],
       [2.89268679e-02, 9.71073132e-01],
       ...,
       [4.81191452e-01, 5.18808548e-01],
       [9.56944805e-01, 4.30551951e-02],
       [9.99453701e-01, 5.46299497e-04]])

In [None]:
mnp_probs = m_naive.predict_proba(X_test_tfiddy)

In [None]:
import pickle
filename_model = 'apps/model.sav'
pickle.dump(m_naive, open(filename_model, 'wb'))

In [None]:
filename_tfidf = 'apps/tfidf.sav'
pickle.dump(tfiddy, open(filename_tfidf, 'wb'))

In [None]:
input_text = "something about indemnify"

In [None]:
tfidf_model = pickle.load(open(filename_tfidf,'rb'))
loaded_model = pickle.load(open(filename_model, 'rb'))
#input_vectorized = tfidf_model.transform(pd.Series(input_text)).toarray()
#classification,probability = loaded_model.predict(input_vectorized),loaded_model.predict_proba(input_vectorized)
#print(f'Classification:{classification}classification,f'probability)

In [None]:
from sklearn.metrics import roc_curve, auc
from sklearn import datasets
from sklearn.multiclass import OneVsRestClassifier
from sklearn.preprocessing import label_binarize
import matplotlib.pyplot as plt

In [None]:
clf = OneVsRestClassifier(MultinomialNB(alpha=.01))
y_score = clf.fit(X_train_tfiddy,ytrain_bin).predict_proba(X_test_tfiddy)
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(3):
    fpr[i], tpr[i], _ = roc_curve(ytest_bin[:, i], y_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])
fig, ax = plt.subplots(figsize=(14,6))
ax.plot([0, 1], [0, 1], 'k--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('One Vs Rest ROC Curve - Multinomial Naive Bayes')
for i in range(3):
    ax.plot(fpr[i], tpr[i], label=f'ROC curve (area = {roc_auc[i]:.2f}) for {labels[i]} vs all')
ax.legend(loc="best")
ax.grid(alpha=.4)
sns.despine()
plt.show()

In [None]:
y_score.shape
roc_curve()

In [None]:
model.predict_proba(X_test_tfiddy)[4]

In [None]:
y_test.iloc[200]

In [None]:
model.predict(X_test_tfiddy)[200]

In [None]:
df.iloc[[1681,2194]]

In [None]:
truegood,trueneutral,truebad,neutral_notgood,neutral_notbad,good_notneutral,good_notbad,bad_notneutral,bad_notgood = 0,0,0,0,0,0,0,0,0

for predicted,actual in zip(y_hat,y_test):
    if predicted == actual:
        if predicted == 'good':
            truegood += 1
        if predicted == 'neutral':
            trueneutral += 1
        if predicted == 'bad':
            truebad += 1
    elif (actual == 'good') & (predicted == 'neutral'):
        good_notneutral += 1
    elif (actual == 'good') & (predicted == 'bad'):
        good_notbad += 1
    elif (actual == 'neutral') & (predicted == 'good'):
        neutral_notgood += 1
    elif (actual == 'neutral') & (predicted == 'bad'):
        neutral_notbad += 1
    elif (actual == 'bad') & (predicted == 'neutral'):
        bad_notneutral += 1
    elif (actual == 'bad') & (predicted == 'good'):
        bad_notgood += 1

In [None]:
actualgood = [truegood,good_notneutral,good_notbad]
actualneutral = [neutral_notgood,trueneutral,neutral_notbad]
actualbad = [bad_notgood,bad_notneutral,truebad]
actualgood
actualbad

In [None]:
#of course there is libary for this

from sklearn.metrics import confusion_matrix
matrix = confusion_matrix(y_test, y_hat)[::-1]
matrix

In [None]:
matrix = confusion_matrix(y_test, y_hat)[::-1]
fig,ax = plt.subplots()

sns.heatmap(matrix,annot=True,cbar=False,cmap='Blues', fmt='g')
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.title('Confusion Matrix')
plt.xticks([.5,1.5,2.5],labels=['Bad','Good','Neutral'])
plt.yticks([.5,1.5,2.5],labels=['Neutral','Good','Bad'])
plt.ylim(-.5,3)

In [None]:
#Multinomial NB with count vectorizer instead of TFIDF

model = MultinomialNB(alpha=.01)
model.fit(X_train_voncount, y_train)
y_hat = model.predict(X_test_voncount)

model.score(X_test_voncount, y_test)

In [None]:
#Complement NB

model = MultinomialNB(alpha=.3)
model.fit(X_train_tfiddy, y_train)
y_hat = model.predict(X_test_tfiddy)

#model.predict_proba(X_test_tfiddy)
model.score(X_test_tfiddy,y_test)

In [118]:
dime = LogisticRegressionCV(solver='lbfgs',Cs=100,max_iter=200)
dime.fit(X_train_tfiddy,y_train)
y_dime = dime.predict_proba(y_test)

ValueError: Expected 2D array, got 1D array instead:
array=[0 1 1 0 0 0 1 0 1 0 0 1 0 0 1 1 1 1 0 0 1 1 1 1 0 0 1 0 0 1 1 0 1 0 0 1 0
 0 1 1 1 0 0 1 1 1 1 0 1 1 1 1 0 0 0 1 1 0 1 0 1 1 1 0 1 1 0 0 1 1 0 0 0 1
 0 0 0 0 0 0 0 0 0 0 0 0 1 1 0 1 0 0 1 1 0 0 0 0 1 0 1 0 0 1 1 0 0 1 0 1 1
 0 1 0 0 0 1 1 1 1 0 0 0 1 1 0 0 1 1 1 0 1 0 0 0 1 1 1 1 0 0 0 0 1 1 1 1 1
 1 0 1 1 0 0 0 1 0 0 1 0 0 1 0 1 1 0 1 0 0 0 0 0 0 0 0 1 1 0 1 0 0 1 0 0 0
 0 0 0 0 1 0 0 1 1 0 0 0 0 1 0 0 0 1 0 0 1 0 0 0 1 1 1 0 0 1 1 0 0 1 0 1 1
 1 0 0 1 1 0 0 1 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 1 1 0 0 1 0 1 1 0 0 0 0 0 1
 1 0 0 0 0 1 0 1 0 0 1 0 0 0 0 0 0 0 1 1 0 0 0 1 1 1 1 0 0 1 1 1 0 1 1 1 1
 1 1 0 1 0 1 0 0 0 1 0 1 0 1 0 1 0 1 1 1 0 0 1 1 0 1 0 0 0 1 1 0 0 1 0 0 1
 0 0 0 1 0 0 0 1 0 1 0 1 1 1 1 0 1 0 0 0 1 1 0 1 0 1 1 1 1 0 0 0 0 0 1 1 0
 1 0 1 1 0 1 0 1 0 0 1 1 0 1 1 0 1 1 0 0 1 1 0 1 1 0 0 1 0 1 0 1 1 1 0 1 1
 0 0 0 0 0 0 0 0 1 1 0 1 1 0 0 0 1 1 0 0 1 0 0 1 1 1 0 0 1 0 0 1 0 0 1 0 1
 0 0 0 0 0 0 0 1 0 1 0 1 0 0 1 1 1 0 0 1 1 1 0 1 0 0 1 0 1 0 1 0 0 1 0 1 1
 1 0 0 1 1 1 0 0 0 0 1 1 1 1 0 1 1 1 0 0 1 0 1 0 0 1 0 0 0 1 1 0 0 1 0 0 0
 1 0 1 0 0 0 0 0 0 0 0 0 1 0 1 0 0 1 1 0 0 0 0 0 0 0 0 1 1 0 1 0 0 0 1 0 0
 0 1 1 1 1 1 0 0 0 1 0 0 1 1 1 1 1 0 1 0 1 0 1 1 1 0 0 0 0 0 1 1 0 0 1 0 0
 1 0 1 1 0 0 1 1 1 0 1 1 1 1 1 0 0 0].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.

In [None]:
y_dime = dime.predict_proba(y_test)

In [114]:
logreg = LogisticRegression(solver='newton-cg',C=100,max_iter=200)
logreg.fit(X_train_tfiddy,y_train)
logreg.score(X_test_tfiddy,y_test)

0.8639344262295082

In [187]:
logy_hats = logreg.predict_proba(X_test_tfiddy)

In [115]:
dime.score(X_test_tfiddy,y_test)

0.8655737704918033

In [None]:
y_dime = dime.predict(X_test_tfiddy)
log_prob = dime.predict_proba(X_test_tfiddy)

In [None]:
dime.classes_

In [35]:
tfiddy = TfidfVectorizer(ngram_range=(1,2))
tfiddy.fit(X_train)
X_train_tfiddy = tfiddy.transform(X_train).toarray()
X_test_tfiddy = tfiddy.transform(X_test).toarray()

model = MultinomialNB(alpha=.02)
model.fit(X_train_tfiddy, y_train)
y_mnb = model.predict(X_test_tfiddy)

model.score(X_test_tfiddy, y_test)

NameError: name 'X_train' is not defined

In [None]:
model.classes_

In [189]:
combined_proba = (y_hat + logy_hats)/2

In [193]:
(np.amax(combined_proba,axis=1)<0.6)

array([False, False, False, False, False, False, False, False,  True,
       False, False, False, False, False, False, False, False, False,
       False, False,  True, False, False, False, False, False, False,
       False, False, False, False, False,  True, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False,  True, False, False, False, False, False, False, False,
       False, False, False,  True, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False, False, False, False, False, False,
       False, False, False, False,  True, False, False, False, False,
       False, False, False, False, False,  True, False, False, False,
       False, False,  True, False, False, False, False, False, False,
       False, False,

In [None]:
pd.concat([pd.Series(X_test),pd.Series(y_test),pd.Series(y_pred)])

In [None]:
y_pred = model.classes_[arg]
y_pred.reshape()

In [None]:
arg = np.argmax(a,axis=1).reshape(610,1)

In [None]:
array = np.full((610,3),fill_value=False)

In [None]:
array[np.arange(len(a)),a.argmax(1)] = True

In [None]:
dime.score(X_test_tfiddy,y_test)

In [None]:
lostwoods = RandomForestClassifier(n_estimators=2000)
lostwoods.fit(X_train_tfiddy,y_train)

In [None]:
lostwoods.score(X_test_tfiddy,y_test)

In [None]:
gradientfalls = GradientBoostingClassifier(n_estimators=1000,subsample=0.5)
gradientfalls.fit(X_train_tfiddy,y_train)

In [None]:
gradientfalls.score(X_test_tfiddy,y_test)
gradientfalls.predict_

# Metrics

In [None]:
fig,axs = plt.subplots(1,3)

preds = [y_dime,y_mnb,y_pred]

for i,ax in enumerate(axs):
    confusion_matrix(y_test, preds[i])[::-1]
    sns.heatmap(confusion_matrix(y_test, preds[i])[::-1],ax=ax,annot=True,cbar=False,cmap='Blues', fmt='g')
    ax.set_title('Confusion Matrix')
    ax.set_ylabel('Actual')
    ax.set_xlabel('Predicted')
    ax.set_xticks([.5,1.5,2.5])
    #ax.set_xlabel(['Bad','Good','Neutral'])
    ax.set_yticks([.5,1.5,2.5])#,labels=['Neutral','Good','Bad'])
    ax.set_ylim(-.5,3)
    
plt.tight_layout()
#plt.xticks(ticks=[.5,1.5,2.5],labels=['Bad','Good','Neutral'])

In [None]:
clflost = OneVsRestClassifier(RandomForestClassifier(n_estimators=100))
ylost_score = clflost.fit(X_train_tfiddy,ytrain_bin).predict_proba(X_test_tfiddy)

In [None]:
fpr = dict()
tpr = dict()
roc_auc = dict()
labels = ['bad','good','neutral']
for i in range(3):
    fpr[i], tpr[i], _ = roc_curve(ytest_bin[:, i], ylost_score[:, i])
    roc_auc[i] = auc(fpr[i], tpr[i])
fig, ax = plt.subplots(figsize=(14,6))
ax.plot([0, 1], [0, 1], 'k--')
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.set_xlabel('False Positive Rate')
ax.set_ylabel('True Positive Rate')
ax.set_title('One Vs Rest ROC Curve - RandomForest')
for i in range(3):
    ax.plot(fpr[i], tpr[i], label=f'ROC curve (area = {roc_auc[i]:.2f}) for {labels[i]} vs all')
ax.legend(loc="best")
ax.grid(alpha=.4)
sns.despine()
plt.show()

In [198]:
terms_US = '''Users of the Services may be permitted to upload, post or transmit (such as via a stream) or otherwise make available content through the Services including, without limitation, any text, photographs, user videos, sound recordings and the musical works embodied therein, including videos that incorporate locally stored sound recordings from your personal music library and ambient noise (“User Content”). Users of the Services may also extract all or any portion of User Content created by another user to produce additional User Content, including collaborative User Content with other users, that combine and intersperse User Content generated by more than one user. Users of the Services may also overlay music, graphics, stickers, Virtual Items (as defined and further explained Virtual Items Policy) and other elements provided by TikTok (“TikTok Elements”) onto this User Content and transmit this User Content through the Services. The information and materials in the User Content, including User Content that includes TikTok Elements, have not been verified or approved by us. The views expressed by other users on the Services (including through use of the virtual gifts) do not represent our views or values.

Whenever you access or use a feature that allows you to upload or transmit User Content through the Services (including via certain third party social media platforms such as Instagram, Facebook, YouTube, Twitter), or to make contact with other users of the Services, you must comply with the standards set out at “Your Access to and Use of Our Services” above. You may also choose to upload or transmit your User Content, including User Content that includes TikTok Elements, on sites or platforms hosted by third parties. If you decide to do this, you must comply with their content guidelines as well as with the standards set out at “Your Access to and Use of Our Services” above. As noted above, these features may not be available to all users of the Services, and we have no liability to you for limiting your right to certain features of the Services.

You warrant that any such contribution does comply with those standards, and you will be liable to us and indemnify us for any breach of that warranty. This means you will be responsible for any loss or damage we suffer as a result of your breach of warranty.

Any User Content will be considered non-confidential and non-proprietary. You must not post any User Content on or through the Services or transmit to us any User Content that you consider to be confidential or proprietary. When you submit User Content through the Services, you agree and represent that you own that User Content, or you have received all necessary permissions, clearances from, or are authorised by, the owner of any part of the content to submit it to the Services, to transmit it from the Services to other third party platforms, and/or adopt any third party content.

If you only own the rights in and to a sound recording, but not to the underlying musical works embodied in such sound recordings, then you must not post such sound recordings to the Services unless you have all permissions, clearances from, or are authorised by, the owner of any part of the content to submit it to the Services

You or the owner of your User Content still own the copyright in User Content sent to us, but by submitting User Content via the Services, you hereby grant us an unconditional irrevocable, non-exclusive, royalty-free, fully transferable, perpetual worldwide licence to use, modify, adapt, reproduce, make derivative works of, publish and/or transmit, and/or distribute and to authorise other users of the Services and other third-parties to view, access, use, download, modify, adapt, reproduce, make derivative works of, publish and/or transmit your User Content in any format and on any platform, either now known or hereinafter invented.

You further grant us a royalty-free license to use your user name, image, voice, and likeness to identify you as the source of any of your User Content; provided, however, that your ability to provide an image, voice, and likeness may be subject to limitations due to age restrictions.

For the avoidance of doubt, the rights granted in the preceding paragraphs of this Section include, but are not limited to, the right to reproduce sound recordings (and make mechanical reproductions of the musical works embodied in such sound recordings), and publicly perform and communicate to the public sound recordings (and the musical works embodied therein), all on a royalty-free basis. This means that you are granting us the right to use your User Content without the obligation to pay royalties to any third party, including, but not limited to, a sound recording copyright owner (e.g., a record label), a musical work copyright owner (e.g., a music publisher), a performing rights organization (e.g., ASCAP, BMI, SESAC, etc.) (a “PRO”), a sound recording PRO (e.g., SoundExchange), any unions or guilds, and engineers, producers or other royalty participants involved in the creation of User Content.

Specific Rules for Musical Works and for Recording Artists. If you are a composer or author of a musical work and are affiliated with a PRO, then you must notify your PRO of the royalty-free license you grant through these Terms in your User Content to us. You are solely responsible for ensuring your compliance with the relevant PRO’s reporting obligations. If you have assigned your rights to a music publisher, then you must obtain the consent of such music publisher to grant the royalty-free license(s) set forth in these Terms in your User Content or have such music publisher enter into these Terms with us. Just because you authored a musical work (e.g., wrote a song) does not mean you have the right to grant us the licenses in these Terms. If you are a recording artist under contract with a record label, then you are solely responsible for ensuring that your use of the Services is in compliance with any contractual obligations you may have to your record label, including if you create any new recordings through the Services that may be claimed by your label.

Through-To-The-Audience Rights. All of the rights you grant in your User Content in these Terms are provided on a through-to-the-audience basis, meaning the owners or operators of third party services will not have any separate liability to you or any other third party for User Content posted or used on such third party service via the Services.

Waiver of Rights to User Content. By posting User Content to or through the Services, you waive any rights to prior inspection or approval of any marketing or promotional materials related to such User Content. You also waive any and all rights of privacy, publicity, or any other rights of a similar nature in connection with your User Content, or any portion thereof. To the extent any moral rights are not transferable or assignable, you hereby waive and agree never to assert any and all moral rights, or to support, maintain or permit any action based on any moral rights that you may have in or with respect to any User Content you Post to or through the Services.

We also have the right to disclose your identity to any third party who is claiming that any User Content posted or uploaded by you to our Services constitutes a violation of their intellectual property rights, or of their right to privacy.

We, or authorised third parties, reserve the right to cut, crop, edit or refuse to publish, your content at our or their sole discretion. We have the right to remove, disallow, block or delete any posting you make on our Services if, in our opinion, your post does not comply with the content standards set out at “Your Access to and Use of Our Services”above. In addition, we have the right – but not the obligation – in our sole discretion to remove, disallow, block or delete any User Content (i) that we consider to violate these Terms, or (ii) in response to complaints from other users or third parties, with or without notice and without any liability to you. As a result, we recommend that you save copies of any User Content that you post to the Services on your personal device(s) in the event that you want to ensure that you have permanent access to copies of such User Content. We do not guarantee the accuracy, integrity, appropriateness or quality of any User Content, and under no circumstances will we be liable in any way for any User Content.

You control whether your User Content is made publicly available on the Services to all other users of the Services or only available to people you approve. To restrict access to your User Content, you should select the privacy setting available within the Platform.

We accept no liability in respect of any content submitted by users and published by us or by authorised third parties.

If you wish to complain about information and materials uploaded by other users please contact us at: feedback@tiktok.com.

TikTok takes reasonable measures to expeditiously remove from our Services any infringing material that we become aware of.It is TikTok’s policy, in appropriate circumstances and at its discretion, to disable or terminate the accounts of users of the Services who repeatedly infringe copyrights or intellectual property rights of others.

While our own staff is continually working to develop and evaluate our own product ideas and features, we pride ourselves on paying close attention to the interests, feedback, comments, and suggestions we receive from the user community. If you choose to contribute by sending us or our employees any ideas for products, services, features, modifications, enhancements, content, refinements, technologies, content offerings (such as audio, visual, games, or other types of content), promotions, strategies, or product/feature names, or any related documentation, artwork, computer code, diagrams, or other materials (collectively “Feedback”), then regardless of what your accompanying communication may say, the following terms will apply, so that future misunderstandings can be avoided. Accordingly, by sending Feedback to us, you agree that:

TikTok has no obligation to review, consider, or implement your Feedback, or to return to you all or part of any Feedback for any reason;

Feedback is provided on a non-confidential basis, and we are not under any obligation to keep any Feedback you send confidential or to refrain from using or disclosing it in any way; and

You irrevocably grant us perpetual and unlimited permission to reproduce, distribute, create derivative works of, modify, publicly perform (including on a through-to-the-audience basis), communicate to the public, make available, publicly display, and otherwise use and exploit the Feedback and derivatives thereof for any purpose and without restriction, free of charge and without attribution of any kind, including by making, using, selling, offering for sale, importing, and promoting commercial products and services that incorporate or embody Feedback, whether in whole or in part, and whether as provided or as modified.'''

In [199]:
terms_EU = '''    Users of the Services may be permitted to upload, post or transmit (such as via a stream) or otherwise make available content through the Services including, without limitation, any text, photographs, user videos, sound recordings and the musical works embodied therein, including videos that incorporate locally stored sound recordings from your personal music library and ambient noise (“User Content”). Users of the Services may also extract all or any portion of User Content created by another user to produce additional User Content, including collaborative User Content with other users, that combine and intersperse User Content generated by more than one user. Users of the Services may also overlay music, graphics, stickers, Virtual Items (as defined and further explained in the “VIRTUAL ITEMS POLICY “) and other elements provided by TikTok (“TikTok Elements”) onto this User Content and transmit this User Content through the Services. The information and materials in the User Content, including User Content that includes TikTok Elements, have not been verified or approved by us. The views expressed by other users on the Services (including through use of the virtual gifts) do not represent our views or values.

    Whenever you access or use a feature that allows you to upload or transmit User Content through the Services (including via certain third party social media platforms such as Instagram, Facebook, YouTube, Twitter), or to make contact with other users of the Services, you must comply with the standards set out at Section 5 above. You may also choose to upload or transmit your User Content, including User Content that includes TikTok Elements, on sites or platforms hosted by third parties. If you decide to do this, you must comply with their content guidelines as well as with the standards set out in this Section 7 above.

    You warrant that any such contribution does comply with those standards, and you will be liable to us and indemnify us for any breach of that warranty. This means you will be responsible for any loss or damage we suffer as a result of your breach of warranty.

    Any User Content will be considered non-confidential and non-proprietary. You must not post any User Content on or through the Services or transmit to us any User Content that you consider to be confidential or proprietary. When you submit User Content through the Services, you agree and represent that you own that User Content, or you have received all necessary permissions, clearances from, or are authorised by, the owner of any part of the content to submit it to the Services, to transmit it from the Services to other third party platforms, and/or adopt any third party content.

    If you only own the rights in and to a sound recording, but not to the underlying musical works embodied in such sound recordings, then you must not post such sound recordings to the Services unless you have all permissions, clearances from, or are authorised by, the owner of any part of the content to submit it to the Services .

    You or the owner of your User Content still own the copyright in User Content sent to us, but by submitting User Content via the Services, you hereby grant us an unconditional irrevocable, non-exclusive, royalty-free, fully transferable, perpetual worldwide licence to use, modify, adapt, reproduce, make derivative works of, publish and/or transmit, and/or distribute and to authorise others users of the Services and other third-parties to view, access, use, download, modify, adapt, reproduce, make derivative works of, publish and/or transmit your User Content in any format and on any platform, either now known or hereinafter invented.

    You further grant us a royalty-free license to use your user name, image, voice, and likeness to identify you as the source of any of your User Content.

    For the avoidance of doubt, the rights granted in the preceding paragraphs of this Section include, but are not limited to, the right to reproduce sound recordings (and make mechanical reproductions of the musical works embodied in such sound recordings), and publicly perform and communicate to the public sound recordings (and the musical works embodied therein), all on a royalty-free basis. This means that you are granting us the right to Use your User Content without the obligation to pay royalties to any third party, including, but not limited to, a sound recording copyright owner ( e.g. , a record label), a musical work copyright owner ( e.g. , a music publisher), a performing rights organization ( e.g. , ASCAP, BMI, SESAC, etc.) (a “ PRO ”), a sound recording PRO (e.g., SoundExchange), any unions or guilds, and engineers, producers or other royalty participants involved in the creation of User Content.

Specific Rules for Musical Works and for Recording Artists. If you are a composer or author of a musical work and are affiliated with a [PRO], then you must notify your PRO of the royalty-free license you grant through these Terms in your User Content to us. You are solely responsible for ensuring your compliance with the relevant PRO’s reporting obligations. If you have assigned your rights to a music publisher, then you must obtain the consent of such music publisher to grant the royalty-free license(s) set forth in these Terms in your User Content or have such music publisher enter into these Terms with us. Just because you authored a musical work (e.g., wrote a song) does not mean you have the right to grant us the licenses in these Terms. If you are a recording artist under contract with a record label, then you are solely responsible for ensuring that your use of the Services is in compliance with any contractual obligations you may have to your record label, including if you create any new recordings through the Services that may be claimed by your label.

Through-To-The-Audience Rights. All of the rights you grant in your User Content in these Terms are provided on a through-to-the-audience basis, meaning the owners or operators of third party services will not have any separate liability to you or any other third party for User Content posted or used on such third party service via the Services.

Waiver of Rights to User Content. By posting User Content to or through the Services, you waive any rights to prior inspection or approval of any marketing or promotional materials related to such User Content. You also waive any and all rights of privacy, publicity, or any other rights of a similar nature in connection with your User Content, or any portion thereof. To the extent any moral rights are not transferable or assignable, you hereby waive and agree never to assert any and all moral rights, or to support, maintain or permit any action based on any moral rights that you may have in or with respect to any User Content you Post to or through the Services.

In certain circumstances, we also have the right to disclose your identity to any third party who is claiming that any User Content posted or uploaded by you to our Services constitutes a violation of their intellectual property rights, or of their right to privacy.

We, or authorised third parties, reserve the right to cut, crop, edit or refuse to publish, your content at our or their sole discretion. We have the right to remove, disallow, block or delete any posting you make on our Platform if, in our opinion, your post does not comply with the content standards set out at Section 5 (Your Use of Our Services) above. In addition, we have the right – but not the obligation – in our sole discretion to remove, disallow, block or delete any User Content (i) that we consider to violate these Terms, or (ii) in response to complaints from other users or third parties, with or without notice and without any liability to you. As a result, we recommend that you save copies of any User Content that you post to the Services on your personal device(s) in the event that you want to ensure that you have permanent access to copies of such User Content. We do not guarantee the accuracy, integrity, appropriateness or quality of any User Content, and under no circumstances will we be liable in any way for any User Content.

You control whether your User Content is made publicly available on the Services to all other users of the Services or only available to people you approve. To change the default access setting for how your User Content is made available to other users, you should select the privacy setting available within the Apps.

We accept no liability in respect of any content submitted by users and published by us or by authorised third parties.

If you wish to complain about information and materials uploaded by other users, please contact us at reports@tiktok.com. 

TikTok takes reasonable measures to expeditiously remove from our Services any infringing material that we become aware of. It is TikTok’s policy, in appropriate circumstances and at its discretion, to disable or terminate the accounts of users of the Services who repeatedly infringe copyrights or intellectual property rights of others.

While our own staff is continually working to develop and evaluate our own product ideas and features, we pride ourselves on paying close attention to the interests, feedback, comments, and suggestions we receive from the user community. If you choose to contribute by sending us or our employees any ideas for products, services, features, modifications, enhancements, content, refinements, technologies, content offerings (such as audio, visual, games, or other types of content), promotions, strategies, or product/feature names, or any related documentation, artwork, computer code, diagrams, or other materials (collectively “Feedback”), then regardless of what your accompanying communication may say, the following terms will apply, so that future misunderstandings can be avoided. Accordingly, by sending Feedback to us, you agree that:

    we have no obligation to review, consider, or implement your Feedback, or to return to you all or part of any Feedback for any reason;

    Feedback is provided on a non-confidential basis, and we are not under any obligation to keep any Feedback you send confidential or to refrain from using or disclosing it in any way;

    and You irrevocably grant us perpetual and unlimited permission to reproduce, distribute, create derivative works of, modify, publicly perform (including on a through-to-the-audience basis), communicate to the public, make available, publicly display, and otherwise use and exploit the Feedback and derivatives thereof for any purpose and without restriction, free of charge and without attribution of any kind, including by making, using, selling, offering for sale, importing, and promoting commercial products and services that incorporate or embody Feedback, whether in whole or in part, and whether as provided or as modified.'''

In [None]:
len(terms_US)

In [133]:
len(terms_EU)

10938

In [None]:
terms_US = terms_US.split('\n')
terms_US = [term for term in terms_US if term != '']

In [134]:
terms_EU = terms_EU.split('\n')

In [135]:
terms_EU = [term for term in terms_EU if term != '']

In [130]:
import pickle
tfidf_model = pickle.load(open('apps/tfidf.sav','rb'))
loaded_model = pickle.load(open('apps/model.sav', 'rb'))
#input_vectorized = tfidf_model.transform(pd.Series(term)).toarray()
#classification = str(loaded_model.predict(input_vectorized))

In [131]:
classification_US = []

for term in terms_US:
    classification_US.append(str(loaded_model.predict(tfidf_model.transform(pd.Series(term)).toarray())))

NameError: name 'terms_US' is not defined

In [214]:
classification_EU = []

for term in terms_EU:
    classification_EU.append(loaded_model.predict(tfidf_model.transform(pd.Series(term)).toarray()))

In [215]:
classification_EU

[array(['bad'], dtype='<U7'),
 array(['bad'], dtype='<U7'),
 array(['bad'], dtype='<U7'),
 array(['bad'], dtype='<U7'),
 array(['bad'], dtype='<U7'),
 array(['bad'], dtype='<U7'),
 array(['bad'], dtype='<U7'),
 array(['bad'], dtype='<U7'),
 array(['bad'], dtype='<U7'),
 array(['bad'], dtype='<U7'),
 array(['bad'], dtype='<U7'),
 array(['bad'], dtype='<U7'),
 array(['bad'], dtype='<U7'),
 array(['bad'], dtype='<U7'),
 array(['bad'], dtype='<U7'),
 array(['bad'], dtype='<U7'),
 array(['bad'], dtype='<U7'),
 array(['bad'], dtype='<U7'),
 array(['bad'], dtype='<U7'),
 array(['bad'], dtype='<U7'),
 array(['bad'], dtype='<U7'),
 array(['bad'], dtype='<U7'),
 array(['bad'], dtype='<U7'),
 array(['bad'], dtype='<U7'),
 array(['bad'], dtype='<U7'),
 array(['bad'], dtype='<U7'),
 array(['bad'], dtype='<U7'),
 array(['bad'], dtype='<U7'),
 array(['bad'], dtype='<U7'),
 array(['bad'], dtype='<U7'),
 array(['bad'], dtype='<U7'),
 array(['bad'], dtype='<U7'),
 array(['bad'], dtype='<U7'),
 array(['b

In [None]:
pd.DataFrame([classification_US,classification_EU],index=['US','EU']).T

In [None]:
import torch

In [None]:
import transformers as ppb

# Neural Networks

In [None]:
import gensim

word2vec_path = "GoogleNews-vectors-negative300.bin.gz"
word2vec = gensim.models.KeyedVectors.load_word2vec_format(word2vec_path, binary=True)

def get_average_word2vec(tokens_list, vector, generate_missing=False, k=300):
    if len(tokens_list)<1:
        return np.zeros(k)
    if generate_missing:
        vectorized = [vector[word] if word in vector else np.random.rand(k) for word in tokens_list]
    else:
        vectorized = [vector[word] if word in vector else np.zeros(k) for word in tokens_list]
    length = len(vectorized)
    summed = np.sum(vectorized, axis=0)
    averaged = np.divide(summed, length)
    return averaged

def get_word2vec_embeddings(vectors, clean_questions, generate_missing=False):
    embeddings = clean_questions['tokens'].apply(lambda x: get_average_word2vec(x, vectors, 
                                                                                generate_missing=generate_missing))
    return list(embeddings)

embeddings = get_word2vec_embeddings(word2vec, clean_questions)
X_train_word2vec, X_test_word2vec, y_train_word2vec, y_test_word2vec = train_test_split(embeddings, list_labels, 
                                                                                        test_size=0.2, random_state=40)

fig = plt.figure(figsize=(16, 16))          
plot_LSA(embeddings, list_labels)
plt.show()



In [120]:
a = {1:'a'}

In [121]:
b = a

In [122]:
c = []

In [123]:
c.append(a)

In [125]:
c.append(b)

In [126]:
c

[{1: 'a'}, {1: 'a'}]

AttributeError: 'list' object has no attribute 'a'