In [88]:
import pandas as pd
import requests
from bs4 import BeautifulSoup
from bs4.element import Comment
import re
import time
from langdetect import detect
import pickle
import numpy as np

In [2]:
from imblearn.over_sampling import RandomOverSampler
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
import nltk



In [3]:
def tag_visible(element):
    if element.parent.name in ['style', 'script', 'head', 'title', 'meta', '[document]']:
        return False
    if isinstance(element, Comment):
        return False
    if element.strip() == "":
        return False
    return True

In [4]:
def fetch_landing_page(url):
    try:
        response = requests.get(url, timeout=(13,17))
        if response:
            html_page = response.text
            soup = BeautifulSoup(html_page, 'html.parser')
            page_texts = soup.findAll(text=True)
            important_texts = filter(tag_visible, page_texts)
            final_site_content = u".".join(x.strip() for x in important_texts)
            title = soup.title
            if title is not None:
                final_site_content = title.text + " " + final_site_content
            return final_site_content
        else:
            print("Invalid Response")
            return None
    except:
        print("Invalid URL/ Timeout")
        return None

## Extracting only useful features and removing null values

In [6]:
data_full = pd.read_csv('Company-Categorization-DFE.csv', encoding = "ISO-8859-1")

In [7]:
data_full.head()

Unnamed: 0,_unit_id,_golden,_canary,_unit_state,_trusted_judgments,_last_judgment_at,business_category,business_category:confidence,description_available,description_available:confidence,...,google1_correct_website,google1_correct_website_found,google1_correct_website_found_gold,google1_correct_website_foundconfidence,google1_correct_website_gold,google1_correct_website_worker_input,google1_correct_website_worker_inputconfidence,google1_correct_websiteconfidence,website,website_match_yn_gold
0,474823173,False,,finalized,3,6/5/14 0:29,Financial Services,1.0,yes,1.0,...,http://central1.com/,yes,,1.0,,/www.central1.com,0.6538,0.6538,,
1,474823174,False,,finalized,3,6/4/14 22:35,Financial Services,0.351,yes,1.0,...,http://ad-agents.com/,yes,,0.64,,ad-agents.com/,0.2304,0.36,http://www.ad-agents.com/kontakt/,
2,474823175,False,,finalized,3,6/4/14 9:46,Other,1.0,yes,1.0,...,http://ad-tech.com/,yes,,1.0,,ad-tech.com,1.0,1.0,,
3,474823176,False,,finalized,3,6/3/14 15:03,Retail,0.6852,yes,1.0,...,http://addrev.com/,yes,,1.0,,addrev.com/,0.6538,1.0,,
4,474823177,False,,finalized,3,6/4/14 10:32,Other,0.6738,yes,1.0,...,http://adotmob.com/,yes,,1.0,,adotmob.com,0.6538,1.0,http://adotmob.com/,


In [8]:
data = data_full[["business_name", "google1_correct_website", "business_category"]]

In [9]:
data.shape

(7335, 3)

In [10]:
data.isna().sum()

business_name               0
google1_correct_website     0
business_category          23
dtype: int64

In [11]:
data = data.dropna()

In [43]:
data.shape

(7312, 3)

In [44]:
data.to_csv("cleaned_data.csv", index=False)

In [45]:
data = pd.read_csv("cleaned_data.csv")

In [46]:
data.head(15)

Unnamed: 0,business_name,google1_correct_website,business_category
0,Central 1,http://central1.com/,Financial Services
1,ad agents GmbH Hamburg - Germany,http://ad-agents.com/,Financial Services
2,ad tech | iMedia,http://ad-tech.com/,Other
3,AddRev,http://addrev.com/,Retail
4,Adotmob,http://adotmob.com/,Other
5,BBC Advertising,http://advertising.bbcworldwide.com/,Other
6,adyard GmbH - Germany,http://adyard.de/,Other
7,airG,http://airg.com/,Other
8,All Star Products Group,http://allstarmg.com/,CPG
9,AlmondNet,http://almondnet.com/,Other


## Domain count distribution

In [12]:
data.apply(lambda x: x.google1_correct_website.split('.')[-1], axis=1).value_counts()

com/                                                     4458
de/                                                       676
uk/                                                       258
org/                                                      221
es/                                                       124
fr/                                                       111
it/                                                        99
net/                                                       88
edu/                                                       81
ch/                                                        65
br/                                                        57
html                                                       48
at/                                                        45
ie/                                                        42
aspx                                                       40
pl/                                                        34
com/en/ 

## Training only on English sites

In [105]:
english_domains_filter = data.apply(lambda x: True
                                   if (x.google1_correct_website.split('.')[-1]=='com/' or
                                   x.google1_correct_website.split('.')[-1]=='uk/' or
                                   x.google1_correct_website.split('.')[-1]=='au/' or
                                   x.google1_correct_website.split('.')[-1]=='gov/' or
                                   x.google1_correct_website.split('.')[-1]=='net/' or
                                   x.google1_correct_website.split('.')[-1]=='edu/' or
                                   x.google1_correct_website.split('.')[-1]=='html' or
                                   x.google1_correct_website.split('.')[-1]=='com/en') and
                                   (x.business_name.lower().find('gmbh') == -1)
                                   else False, axis=1)

In [106]:
data_english = data[english_domains_filter]

In [13]:
data_english.to_csv('data_english_sites.csv', index=False)

In [4]:
data_english = pd.read_csv('data_english_sites.csv')

In [5]:
data_english.head()

Unnamed: 0,business_name,google1_correct_website,business_category,site_content
0,Central 1,http://central1.com/,Financial Services,"Search for:.Join our team, we’re growing..VIEW..."
1,ad tech | iMedia,http://ad-tech.com/,Other,
2,AddRev,http://addrev.com/,Retail,
3,Adotmob,http://adotmob.com/,Other,
4,BBC Advertising,http://advertising.bbcworldwide.com/,Other,


In [6]:
data_english["business_category"].value_counts()

Other                 2562
Retail                1011
Financial Services     432
CPG                    335
Travel                 331
Auto                   162
Name: business_category, dtype: int64

In [7]:
data_english['site_content'] = None

In [10]:
data_english.head()

Unnamed: 0,business_name,google1_correct_website,business_category,site_content
0,Central 1,http://central1.com/,Financial Services,
1,ad tech | iMedia,http://ad-tech.com/,Other,
2,AddRev,http://addrev.com/,Retail,
3,Adotmob,http://adotmob.com/,Other,
4,BBC Advertising,http://advertising.bbcworldwide.com/,Other,


In [13]:
tic = time.time()
for i in range(300, 1200):
    if (i%50==0):
        toc = time.time()
        print("Time taken for 50 rows:", toc - tic)
        data_english.to_csv('english_sites_with_content.csv', index=False)
        tic = time.time()
    print("row: ", i)
    url = data_english.iloc[i]["google1_correct_website"]
    data_english.iloc[i]["site_content"] = fetch_landing_page(url)

Time taken for 50 rows: 0.0008819103240966797
row:  300
row:  301
row:  302
row:  303
row:  304
row:  305
Invalid URL/ Timeout
row:  306
row:  307
row:  308
row:  309
row:  310
Invalid URL/ Timeout
row:  311
row:  312
row:  313
Invalid Response
row:  314
row:  315
Invalid Response
row:  316
Invalid Response
row:  317
row:  318
Invalid Response
row:  319
row:  320
row:  321
row:  322
row:  323
Invalid Response
row:  324
row:  325
row:  326
row:  327
row:  328
row:  329
row:  330
row:  331
row:  332
row:  333
row:  334
row:  335
row:  336
row:  337
row:  338
Invalid Response
row:  339
row:  340
row:  341
row:  342
row:  343
row:  344
row:  345
row:  346
Invalid Response
row:  347
row:  348
row:  349
Invalid URL/ Timeout
Time taken for 50 rows: 436.75583148002625
row:  350
Invalid URL/ Timeout
row:  351
row:  352
row:  353
row:  354
row:  355
row:  356
row:  357
row:  358
Invalid URL/ Timeout
row:  359
row:  360
Invalid URL/ Timeout
row:  361
row:  362
row:  363
row:  364
row:  365
row:  

Invalid Response
row:  855
row:  856
Invalid URL/ Timeout
row:  857
row:  858
Invalid URL/ Timeout
row:  859
row:  860
row:  861
row:  862
row:  863
row:  864
row:  865
Invalid URL/ Timeout
row:  866
Invalid URL/ Timeout
row:  867
Invalid Response
row:  868
row:  869
row:  870
row:  871
row:  872
row:  873
row:  874
row:  875
row:  876
row:  877
row:  878
row:  879
row:  880
row:  881
row:  882
row:  883
row:  884
row:  885
row:  886
row:  887
row:  888
row:  889
row:  890
row:  891
Invalid Response
row:  892
row:  893
row:  894
row:  895
row:  896
Invalid URL/ Timeout
row:  897
row:  898
Invalid Response
row:  899
Time taken for 50 rows: 191.27922201156616
row:  900
Invalid URL/ Timeout
row:  901
Invalid Response
row:  902
row:  903
row:  904
row:  905
row:  906
row:  907
row:  908
row:  909
row:  910
row:  911
row:  912
row:  913
row:  914
row:  915
row:  916
row:  917
row:  918
Invalid URL/ Timeout
row:  919
row:  920
Invalid URL/ Timeout
row:  921
row:  922
row:  923
row:  924
row:

In [17]:
data_english = data_english[["business_name", "google1_correct_website", "business_category", "site_content"
]]

Unnamed: 0,business_name,google1_correct_website,business_category,site_content
0,Central 1,http://central1.com/,Financial Services,"Search for:.Join our team, we’re growing..VIEW..."
1,ad tech | iMedia,http://ad-tech.com/,Other,
2,AddRev,http://addrev.com/,Retail,
3,Adotmob,http://adotmob.com/,Other,
4,BBC Advertising,http://advertising.bbcworldwide.com/,Other,


In [14]:
data_english.to_csv('english_sites_with_content.csv', index=False)

In [23]:
subdata = data_english[:1199].dropna()

In [24]:
tfidf = TfidfVectorizer(stop_words='english')
tfidf_vector = tfidf.fit_transform(subdata["site_content"])

In [26]:
X_tfidf = tfidf_vector.toarray()

In [27]:
X_tfidf.shape

(959, 43539)

In [33]:
le = LabelEncoder()
y_tfidf = le.fit_transform(subdata['business_category'])

In [34]:
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y_tfidf, test_size=0.3, random_state=42)

In [35]:
lr = LogisticRegression(random_state=42)
lr.fit(X_train, y_train)

LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
          intercept_scaling=1, max_iter=100, multi_class='ovr', n_jobs=1,
          penalty='l2', random_state=42, solver='liblinear', tol=0.0001,
          verbose=0, warm_start=False)

In [36]:
lr.score(X_test, y_test)

0.5625

## Change of Approach: Labelling Languages and training separate classifiers

In [47]:
url = data.iloc[1]["google1_correct_website"]

In [48]:
text = fetch_landing_page(url)

In [49]:
from langdetect import detect

In [50]:
detect(text)

'de'

In [53]:
data['site-content'] = None
data['site-language'] = None
data.head()

Unnamed: 0,business_name,google1_correct_website,business_category,site-content,site-language
0,Central 1,http://central1.com/,Financial Services,,
1,ad agents GmbH Hamburg - Germany,http://ad-agents.com/,Financial Services,,
2,ad tech | iMedia,http://ad-tech.com/,Other,,
3,AddRev,http://addrev.com/,Retail,,
4,Adotmob,http://adotmob.com/,Other,,


In [54]:
data.to_csv('data_with_lang_labels.csv')

In [104]:
tic = time.time()
for i in range(4500, data.shape[0]):
    if (i%100==0):
        toc = time.time()
        print("Time taken for 100 rows:", toc - tic)
        data.to_csv('data_with_lang_labels.csv', index=False)
        tic = time.time()
    print("row: ", i)
    url = data.iloc[i]["google1_correct_website"]
    content = fetch_landing_page(url)
    if (content is not None) and (content != ''):
        data.at[i, "site-content"] = content
        try:
            lang = detect(content)    
            data.at[i,"site-language"] = lang
        except:
            continue
    

Time taken for 100 rows: 0.00043892860412597656
row:  4500
row:  4501
row:  4502
row:  4503
row:  4504
row:  4505
row:  4506
row:  4507
Invalid URL/ Timeout
row:  4508
row:  4509
row:  4510
Invalid URL/ Timeout
row:  4511
row:  4512
row:  4513
row:  4514
row:  4515
row:  4516
row:  4517
row:  4518
Invalid URL/ Timeout
row:  4519
row:  4520


ERROR:urllib3.connection:Certificate did not match expected hostname: wavecloud.com. Certificate: {'subject': ((('jurisdictionCountryName', 'US'),), (('jurisdictionStateOrProvinceName', 'Arizona'),), (('businessCategory', 'Private Organization'),), (('serialNumber', 'R17247303'),), (('countryName', 'US'),), (('stateOrProvinceName', 'Arizona'),), (('localityName', 'Scottsdale'),), (('organizationName', 'Special Domain Services, LLC'),), (('commonName', 'shortener.secureserver.net'),)), 'issuer': ((('countryName', 'US'),), (('stateOrProvinceName', 'Arizona'),), (('localityName', 'Scottsdale'),), (('organizationName', 'Starfield Technologies, Inc.'),), (('organizationalUnitName', 'http://certs.starfieldtech.com/repository/'),), (('commonName', 'Starfield Secure Certificate Authority - G2'),)), 'version': 3, 'serialNumber': 'EF5927289A9FDADB', 'notBefore': 'Sep 26 22:40:51 2018 GMT', 'notAfter': 'Sep 26 22:40:51 2020 GMT', 'subjectAltName': (('DNS', 'shortener.secureserver.net'), ('DNS', '

Invalid URL/ Timeout
row:  4521
Invalid URL/ Timeout
row:  4522
row:  4523
row:  4524
row:  4525
row:  4526
row:  4527
row:  4528
row:  4529
row:  4530
Invalid URL/ Timeout
row:  4531
row:  4532
row:  4533
row:  4534
Invalid Response
row:  4535
Invalid URL/ Timeout
row:  4536
row:  4537
row:  4538
row:  4539
row:  4540
row:  4541
row:  4542
row:  4543
row:  4544
row:  4545
Invalid URL/ Timeout
row:  4546
row:  4547
row:  4548
Invalid Response
row:  4549
row:  4550
row:  4551
row:  4552
row:  4553
Invalid Response
row:  4554
row:  4555
row:  4556
row:  4557
row:  4558
row:  4559
row:  4560
row:  4561
Invalid Response
row:  4562
row:  4563
row:  4564
Invalid URL/ Timeout
row:  4565
Invalid Response
row:  4566
Invalid URL/ Timeout
row:  4567
row:  4568
Invalid URL/ Timeout
row:  4569
Invalid Response
row:  4570
row:  4571
Invalid Response
row:  4572
row:  4573
Invalid URL/ Timeout
row:  4574
row:  4575
row:  4576
row:  4577
row:  4578
row:  4579
row:  4580


Traceback (most recent call last):
  File "/home/ubuntu/miniconda/envs/cv/lib/python3.6/site-packages/urllib3/connectionpool.py", line 396, in _make_request
    assert_header_parsing(httplib_response.msg)
  File "/home/ubuntu/miniconda/envs/cv/lib/python3.6/site-packages/urllib3/util/response.py", line 66, in assert_header_parsing
    raise HeaderParsingError(defects=defects, unparsed_data=unparsed_data)
urllib3.exceptions.HeaderParsingError: [MissingHeaderBodySeparatorDefect()], unparsed data: '/\r\nServer: BigIP\r\nConnection: Keep-Alive\r\nContent-Length: 0\r\n\r\n'


Invalid URL/ Timeout
row:  4581
row:  4582
row:  4583
Invalid URL/ Timeout
row:  4584
row:  4585
row:  4586
Invalid Response
row:  4587
row:  4588
row:  4589
Invalid Response
row:  4590
Invalid Response
row:  4591
Invalid Response
row:  4592
row:  4593
row:  4594
Invalid Response
row:  4595
Invalid URL/ Timeout
row:  4596
row:  4597
Invalid URL/ Timeout
row:  4598
row:  4599
Time taken for 100 rows: 432.5011909008026
row:  4600
row:  4601
Invalid URL/ Timeout
row:  4602
row:  4603
row:  4604
row:  4605
row:  4606
row:  4607
row:  4608
row:  4609
row:  4610
row:  4611
row:  4612
row:  4613
row:  4614
row:  4615
row:  4616
row:  4617
row:  4618
row:  4619
row:  4620
row:  4621
row:  4622
Invalid URL/ Timeout
row:  4623
row:  4624
row:  4625
row:  4626
Invalid URL/ Timeout
row:  4627
Invalid Response
row:  4628
row:  4629
row:  4630
row:  4631
row:  4632
row:  4633
row:  4634
row:  4635
row:  4636
row:  4637
Invalid Response
row:  4638
Invalid URL/ Timeout
row:  4639
row:  4640
row:  4641

ERROR:urllib3.connection:Certificate did not match expected hostname: alconemarketing.com. Certificate: {'subject': ((('countryName', 'US'),), (('postalCode', '75063'),), (('stateOrProvinceName', 'Texas'),), (('localityName', 'Irving'),), (('streetAddress', '7850 N. Belt Line Rd'),), (('organizationName', 'Consolidated Data Services'),), (('organizationalUnitName', 'Consolidated Data Services'),), (('organizationalUnitName', 'Enterprise SSL Wildcard'),), (('commonName', '*.alcone.com'),)), 'issuer': ((('countryName', 'US'),), (('stateOrProvinceName', 'DE'),), (('localityName', 'Wilmington'),), (('organizationName', 'Corporation Service Company'),), (('commonName', 'Trusted Secure Certificate Authority 5'),)), 'version': 3, 'serialNumber': '2B51A34D3361A2FCF169793BC1499A05', 'notBefore': 'Nov 21 00:00:00 2018 GMT', 'notAfter': 'Nov 20 23:59:59 2020 GMT', 'subjectAltName': (('DNS', '*.alcone.com'), ('DNS', 'alcone.com')), 'OCSP': ('http://ocsp.usertrust.com',), 'caIssuers': ('http://crt.

Invalid URL/ Timeout
row:  4824
row:  4825


ERROR:urllib3.connection:Certificate did not match expected hostname: alenty.com. Certificate: {'subject': ((('countryName', 'US'),), (('stateOrProvinceName', 'New York'),), (('localityName', 'New York'),), (('organizationName', 'AppNexus, Inc.'),), (('commonName', 'appnexus.com'),)), 'issuer': ((('countryName', 'US'),), (('organizationName', 'DigiCert Inc'),), (('organizationalUnitName', 'www.digicert.com'),), (('commonName', 'GeoTrust TLS RSA CA G1'),)), 'version': 3, 'serialNumber': '05A83E78D75842AA755AFA5920BD8CC5', 'notBefore': 'Sep  5 00:00:00 2018 GMT', 'notAfter': 'Oct  5 12:00:00 2019 GMT', 'subjectAltName': (('DNS', 'appnexus.com'), ('DNS', 'www.appnexus.com'), ('DNS', 'www.appnexus.net'), ('DNS', 'appnexus.net')), 'OCSP': ('http://status.geotrust.com',), 'caIssuers': ('http://cacerts.geotrust.com/GeoTrustTLSRSACAG1.crt',), 'crlDistributionPoints': ('http://cdp.geotrust.com/GeoTrustTLSRSACAG1.crl',)}


Invalid URL/ Timeout
row:  4826
Invalid URL/ Timeout
row:  4827
Invalid URL/ Timeout
row:  4828
row:  4829
Invalid Response
row:  4830
row:  4831
row:  4832
row:  4833
row:  4834


ERROR:urllib3.connection:Certificate did not match expected hostname: allianzdeutschland.de. Certificate: {'subject': ((('countryName', 'DE'),), (('stateOrProvinceName', 'Bayern'),), (('localityName', 'Muenchen'),), (('organizationName', 'Allianz Technology SE'),), (('organizationalUnitName', 'Deutschland AG'),), (('commonName', 'www.allianzdeutschland.de'),)), 'issuer': ((('countryName', 'BM'),), (('organizationName', 'QuoVadis Limited'),), (('commonName', 'QuoVadis Global SSL ICA G3'),)), 'version': 3, 'serialNumber': '0D8009FC96416A2664F259315B9706DD7B2E6150', 'notBefore': 'Aug 14 06:01:37 2018 GMT', 'notAfter': 'Aug 14 06:10:00 2020 GMT', 'subjectAltName': (('DNS', 'www.allianzdeutschland.de'),), 'OCSP': ('http://ocsp.quovadisglobal.com',), 'caIssuers': ('http://trust.quovadisglobal.com/qvsslg3.crt',), 'crlDistributionPoints': ('http://crl.quovadisglobal.com/qvsslg3.crl',)}


Invalid URL/ Timeout
row:  4835
row:  4836
row:  4837
row:  4838
row:  4839
Invalid URL/ Timeout
row:  4840
row:  4841
row:  4842
row:  4843
row:  4844
row:  4845
Invalid Response
row:  4846
Invalid Response
row:  4847
Invalid Response
row:  4848
row:  4849
row:  4850
Invalid URL/ Timeout
row:  4851
row:  4852
row:  4853
row:  4854
row:  4855
Invalid URL/ Timeout
row:  4856
row:  4857
row:  4858
row:  4859
row:  4860
Invalid URL/ Timeout
row:  4861
Invalid URL/ Timeout
row:  4862
row:  4863
row:  4864
row:  4865
row:  4866
row:  4867
row:  4868
row:  4869
row:  4870
row:  4871
row:  4872
row:  4873
row:  4874
row:  4875
row:  4876
row:  4877
row:  4878
row:  4879
row:  4880
row:  4881
row:  4882
row:  4883
row:  4884
Invalid Response
row:  4885
row:  4886
row:  4887
row:  4888
row:  4889
row:  4890
row:  4891
Invalid URL/ Timeout
row:  4892
Invalid URL/ Timeout
row:  4893
row:  4894
row:  4895
row:  4896
row:  4897
row:  4898
row:  4899
Time taken for 100 rows: 427.1248652935028
row:  

ERROR:urllib3.connection:Certificate did not match expected hostname: bluekettle.ca. Certificate: {'subject': ((('commonName', 'www.bluekettle.ca'),),), 'issuer': ((('countryName', 'US'),), (('organizationName', "Let's Encrypt"),), (('commonName', "Let's Encrypt Authority X3"),)), 'version': 3, 'serialNumber': '04B4940A74015EDDCF0811A733EDF2AAEE27', 'notBefore': 'May  9 13:57:43 2019 GMT', 'notAfter': 'Aug  7 13:57:43 2019 GMT', 'subjectAltName': (('DNS', 'www.bluekettle.ca'),), 'OCSP': ('http://ocsp.int-x3.letsencrypt.org',), 'caIssuers': ('http://cert.int-x3.letsencrypt.org/',)}


Invalid URL/ Timeout
row:  5017
row:  5018
row:  5019
row:  5020
row:  5021
row:  5022
Invalid Response
row:  5023
row:  5024
row:  5025
row:  5026
row:  5027
row:  5028
row:  5029
Invalid Response
row:  5030
row:  5031
row:  5032
row:  5033
row:  5034
row:  5035
row:  5036
row:  5037
row:  5038
row:  5039
row:  5040
row:  5041
row:  5042
Invalid Response
row:  5043
row:  5044
row:  5045
row:  5046
row:  5047
Invalid URL/ Timeout
row:  5048
row:  5049
row:  5050
row:  5051
row:  5052
Invalid URL/ Timeout
row:  5053
row:  5054
row:  5055
Invalid Response
row:  5056
row:  5057
row:  5058
Invalid Response
row:  5059
row:  5060
row:  5061
row:  5062
Invalid Response
row:  5063
Invalid Response
row:  5064
row:  5065
row:  5066
row:  5067
Invalid URL/ Timeout
row:  5068
row:  5069
row:  5070
row:  5071
row:  5072
row:  5073
row:  5074
row:  5075
row:  5076
row:  5077
Invalid URL/ Timeout
row:  5078
row:  5079
row:  5080
row:  5081
row:  5082
row:  5083
row:  5084
Invalid Response
row:  5085


ERROR:urllib3.connection:Certificate did not match expected hostname: envisionitmedia.com. Certificate: {'subject': ((('commonName', 'destinationenv.com'),),), 'issuer': ((('countryName', 'US'),), (('organizationName', "Let's Encrypt"),), (('commonName', "Let's Encrypt Authority X3"),)), 'version': 3, 'serialNumber': '033DBB90ADDD04BBDD50C89D5EDEDDF40F89', 'notBefore': 'May 23 20:21:49 2019 GMT', 'notAfter': 'Aug 21 20:21:49 2019 GMT', 'subjectAltName': (('DNS', 'destinationenv.com'), ('DNS', 'www.destinationenv.com')), 'OCSP': ('http://ocsp.int-x3.letsencrypt.org',), 'caIssuers': ('http://cert.int-x3.letsencrypt.org/',)}


Invalid URL/ Timeout
row:  5195
row:  5196
Invalid Response
row:  5197
row:  5198
row:  5199
Time taken for 100 rows: 437.96310806274414
row:  5200
row:  5201
row:  5202
row:  5203
row:  5204
row:  5205
row:  5206
row:  5207
Invalid Response
row:  5208
row:  5209
Invalid Response
row:  5210
row:  5211
Invalid URL/ Timeout
row:  5212
row:  5213
row:  5214
row:  5215
Invalid URL/ Timeout
row:  5216
row:  5217
Invalid Response
row:  5218
row:  5219
row:  5220
row:  5221
row:  5222
row:  5223
row:  5224
row:  5225
Invalid URL/ Timeout
row:  5226
row:  5227
row:  5228
row:  5229
row:  5230
row:  5231
Invalid Response
row:  5232
row:  5233
row:  5234
row:  5235
row:  5236
row:  5237
row:  5238
Invalid Response
row:  5239
row:  5240
Invalid Response
row:  5241
Invalid Response
row:  5242
row:  5243
row:  5244
row:  5245
row:  5246
row:  5247
row:  5248
row:  5249
row:  5250
row:  5251
row:  5252
row:  5253
row:  5254
row:  5255
row:  5256
row:  5257
row:  5258
row:  5259
row:  5260
row:  5261

ERROR:urllib3.connection:Certificate did not match expected hostname: dpdhl.com. Certificate: {'subject': ((('countryName', 'DE'),), (('stateOrProvinceName', 'Nordrhein-Westfalen'),), (('localityName', 'Bonn'),), (('organizationName', 'Deutsche Post AG'),), (('commonName', 'origin.dpdhl.com'),)), 'issuer': ((('countryName', 'BE'),), (('organizationName', 'GlobalSign nv-sa'),), (('commonName', 'GlobalSign Organization Validation CA - SHA256 - G2'),)), 'version': 3, 'serialNumber': '2B21198463A9F5ABB76F224B', 'notBefore': 'Jan 24 08:51:04 2019 GMT', 'notAfter': 'Jan 25 08:51:04 2020 GMT', 'subjectAltName': (('DNS', 'origin.dpdhl.com'), ('DNS', 'annualreport2015.dpdhl.com'), ('DNS', 'annualreport2016.dpdhl.com'), ('DNS', 'annualreport2017.dpdhl.com'), ('DNS', 'geschaeftsbericht2015.dpdhl.com'), ('DNS', 'geschaeftsbericht2016.dpdhl.com'), ('DNS', 'geschaeftsbericht2017.dpdhl.com'), ('DNS', 'geschaeftsbericht2018.dpdhl.com'), ('DNS', 'origin.test.dpdhl.com')), 'OCSP': ('http://ocsp2.globals

Invalid URL/ Timeout
row:  5309
row:  5310
row:  5311
Invalid URL/ Timeout
row:  5312
Invalid URL/ Timeout
row:  5313
row:  5314
row:  5315
Invalid URL/ Timeout
row:  5316
row:  5317
row:  5318
row:  5319
Invalid URL/ Timeout
row:  5320
Invalid Response
row:  5321
row:  5322
Invalid Response
row:  5323
row:  5324
row:  5325
row:  5326
Invalid Response
row:  5327
row:  5328
Invalid URL/ Timeout
row:  5329
Invalid URL/ Timeout
row:  5330
Invalid URL/ Timeout
row:  5331
row:  5332
row:  5333
row:  5334
row:  5335
row:  5336
row:  5337
row:  5338
Invalid Response
row:  5339
row:  5340
row:  5341
row:  5342
row:  5343
row:  5344
row:  5345
row:  5346
Invalid Response
row:  5347
row:  5348
row:  5349
row:  5350
row:  5351
row:  5352
row:  5353
row:  5354
Invalid URL/ Timeout
row:  5355
row:  5356
Invalid URL/ Timeout
row:  5357
row:  5358
row:  5359
row:  5360
row:  5361
row:  5362
Invalid Response
row:  5363
row:  5364
Invalid URL/ Timeout
row:  5365
row:  5366
Invalid URL/ Timeout
row:  53

row:  5807
row:  5808
row:  5809
row:  5810
row:  5811
Invalid URL/ Timeout
row:  5812
row:  5813
Invalid Response
row:  5814
row:  5815
row:  5816
row:  5817
row:  5818
row:  5819
row:  5820
row:  5821
row:  5822
row:  5823
Invalid Response
row:  5824
row:  5825
row:  5826
row:  5827
row:  5828
Invalid URL/ Timeout
row:  5829
row:  5830
row:  5831
Invalid Response
row:  5832
row:  5833
row:  5834
row:  5835
row:  5836
row:  5837
Invalid URL/ Timeout
row:  5838
row:  5839
row:  5840
row:  5841
row:  5842
row:  5843
row:  5844
row:  5845
row:  5846
row:  5847
row:  5848
row:  5849
row:  5850
Invalid Response
row:  5851
row:  5852
row:  5853
row:  5854
row:  5855
row:  5856
row:  5857
Invalid Response
row:  5858
row:  5859
row:  5860
row:  5861
row:  5862
row:  5863
row:  5864
row:  5865
row:  5866
row:  5867
row:  5868
row:  5869
row:  5870
row:  5871
row:  5872
Invalid URL/ Timeout
row:  5873
row:  5874
row:  5875
row:  5876
row:  5877
row:  5878
Invalid URL/ Timeout
row:  5879
row:  5

ERROR:urllib3.connection:Certificate did not match expected hostname: marinsoftware.co.uk. Certificate: {'subject': ((('countryName', 'US'),), (('stateOrProvinceName', 'California'),), (('localityName', 'San Francisco'),), (('organizationName', 'Marin Software Inc.'),), (('commonName', '*.marinsoftware.com'),)), 'issuer': ((('countryName', 'US'),), (('organizationName', 'DigiCert Inc'),), (('commonName', 'DigiCert SHA2 Secure Server CA'),)), 'version': 3, 'serialNumber': '055F677DE2E12350DFF2F1F5D65FC9AA', 'notBefore': 'Jun 18 00:00:00 2018 GMT', 'notAfter': 'Jul 13 12:00:00 2020 GMT', 'subjectAltName': (('DNS', '*.marinsoftware.com'), ('DNS', 'marinsoftware.com'), ('DNS', 'app.marinsoftware.com'), ('DNS', 'www.marinsoftware.com'), ('DNS', 'pro.marinsoftware.com')), 'OCSP': ('http://ocsp.digicert.com',), 'caIssuers': ('http://cacerts.digicert.com/DigiCertSHA2SecureServerCA.crt',), 'crlDistributionPoints': ('http://crl3.digicert.com/ssca-sha2-g6.crl', 'http://crl4.digicert.com/ssca-sha2

Invalid URL/ Timeout
row:  6006
row:  6007
Invalid URL/ Timeout
row:  6008
row:  6009
row:  6010
row:  6011
row:  6012
row:  6013
row:  6014
row:  6015
row:  6016
row:  6017
row:  6018
row:  6019
row:  6020
row:  6021
row:  6022
Invalid Response
row:  6023
row:  6024
row:  6025
row:  6026
row:  6027
row:  6028
row:  6029
row:  6030
row:  6031
row:  6032
row:  6033
row:  6034
row:  6035
row:  6036
row:  6037
row:  6038
row:  6039
row:  6040
row:  6041
row:  6042
row:  6043
row:  6044
row:  6045
row:  6046
Invalid URL/ Timeout
row:  6047
row:  6048
Invalid URL/ Timeout
row:  6049
row:  6050
row:  6051
row:  6052
row:  6053
row:  6054
row:  6055
row:  6056
row:  6057
row:  6058
row:  6059
row:  6060
row:  6061
Invalid Response
row:  6062
row:  6063


ERROR:urllib3.connection:Certificate did not match expected hostname: monetizeit.com. Certificate: {'subject': ((('countryName', 'DE'),), (('stateOrProvinceName', 'Bayern'),), (('localityName', 'München'),), (('organizationName', 'Team Internet AG'),), (('commonName', '*.parkingcrew.net'),)), 'issuer': ((('countryName', 'US'),), (('organizationName', 'DigiCert Inc'),), (('organizationalUnitName', 'www.digicert.com'),), (('commonName', 'Thawte TLS RSA CA G1'),)), 'version': 3, 'serialNumber': '017129E41208E5BA4A34454A044BF9B2', 'notBefore': 'Jul 25 00:00:00 2018 GMT', 'notAfter': 'Jul 24 12:00:00 2020 GMT', 'subjectAltName': (('DNS', '*.parkingcrew.net'), ('DNS', 'parkingcrew.net')), 'OCSP': ('http://status.thawte.com',), 'caIssuers': ('http://cacerts.thawte.com/ThawteTLSRSACAG1.crt',), 'crlDistributionPoints': ('http://cdp.thawte.com/ThawteTLSRSACAG1.crl',)}


Invalid URL/ Timeout
row:  6064
row:  6065
row:  6066
row:  6067
row:  6068
Invalid URL/ Timeout
row:  6069
row:  6070
row:  6071
row:  6072
row:  6073
row:  6074
row:  6075
row:  6076
row:  6077
Invalid URL/ Timeout
row:  6078
row:  6079
Invalid URL/ Timeout
row:  6080
row:  6081
row:  6082
row:  6083
row:  6084
row:  6085
Invalid URL/ Timeout
row:  6086
row:  6087
row:  6088
row:  6089
row:  6090
row:  6091
row:  6092
row:  6093
row:  6094
row:  6095
row:  6096
row:  6097
row:  6098
row:  6099
Time taken for 100 rows: 369.3452343940735
row:  6100
row:  6101
row:  6102
row:  6103
row:  6104
row:  6105
row:  6106
row:  6107
row:  6108
row:  6109
row:  6110
row:  6111
Invalid Response
row:  6112
row:  6113
row:  6114
row:  6115
row:  6116
row:  6117
row:  6118
row:  6119
row:  6120
row:  6121
row:  6122
row:  6123
row:  6124
row:  6125
row:  6126
Invalid Response
row:  6127
row:  6128
row:  6129
row:  6130
Invalid URL/ Timeout
row:  6131
row:  6132
Invalid URL/ Timeout
row:  6133
row:  

ERROR:urllib3.connection:Certificate did not match expected hostname: piaggio.com. Certificate: {'subject': ((('countryName', 'IT'),), (('stateOrProvinceName', 'Pisa'),), (('localityName', 'Pontedera'),), (('organizationName', 'Piaggio & C. S.p.A.'),), (('organizationalUnitName', 'IT'),), (('commonName', '*.piaggiogroup.com'),)), 'issuer': ((('countryName', 'US'),), (('organizationName', 'DigiCert Inc'),), (('organizationalUnitName', 'www.digicert.com'),), (('commonName', 'Thawte RSA CA 2018'),)), 'version': 3, 'serialNumber': '05F938DA1762C9944FFBB834C9553EB0', 'notBefore': 'Feb 11 00:00:00 2019 GMT', 'notAfter': 'Feb 11 12:00:00 2020 GMT', 'subjectAltName': (('DNS', '*.piaggiogroup.com'), ('DNS', 'piaggiogroup.com')), 'OCSP': ('http://status.thawte.com',), 'caIssuers': ('http://cacerts.thawte.com/ThawteRSACA2018.crt',), 'crlDistributionPoints': ('http://cdp.thawte.com/ThawteRSACA2018.crl',)}


Invalid URL/ Timeout
row:  6380
row:  6381
row:  6382
row:  6383
row:  6384
row:  6385
row:  6386
row:  6387
row:  6388
row:  6389
row:  6390
row:  6391
row:  6392
row:  6393
row:  6394
row:  6395
row:  6396
row:  6397
Invalid URL/ Timeout
row:  6398
row:  6399
Time taken for 100 rows: 554.421350479126
row:  6400
row:  6401
row:  6402
row:  6403
row:  6404
row:  6405
row:  6406
row:  6407
row:  6408
row:  6409
row:  6410
row:  6411
row:  6412
Invalid Response
row:  6413
Invalid Response
row:  6414
row:  6415
Invalid Response
row:  6416
row:  6417
row:  6418
row:  6419
row:  6420
row:  6421
row:  6422
row:  6423
row:  6424
Invalid Response
row:  6425
Invalid Response
row:  6426
Invalid URL/ Timeout
row:  6427
Invalid URL/ Timeout
row:  6428
Invalid URL/ Timeout
row:  6429
Invalid URL/ Timeout
row:  6430
row:  6431
row:  6432
row:  6433
row:  6434
row:  6435
row:  6436
Invalid Response
row:  6437
Invalid URL/ Timeout
row:  6438
row:  6439
row:  6440
Invalid URL/ Timeout
row:  6441
row:  

row:  6904
row:  6905
Invalid Response
row:  6906
row:  6907
row:  6908
row:  6909
row:  6910
row:  6911
Invalid Response
row:  6912
row:  6913
row:  6914
row:  6915
row:  6916
row:  6917
row:  6918
row:  6919
Invalid URL/ Timeout
row:  6920
row:  6921
row:  6922
row:  6923
row:  6924
row:  6925
row:  6926
row:  6927
row:  6928
row:  6929
row:  6930
Invalid URL/ Timeout
row:  6931
row:  6932
Invalid Response
row:  6933
Invalid Response
row:  6934
row:  6935
Invalid URL/ Timeout
row:  6936
row:  6937
row:  6938
row:  6939
row:  6940
row:  6941
row:  6942
row:  6943
row:  6944
row:  6945
row:  6946
row:  6947
row:  6948
row:  6949
row:  6950
row:  6951
row:  6952
row:  6953
Invalid URL/ Timeout
row:  6954
row:  6955
row:  6956
row:  6957
row:  6958
row:  6959
Invalid Response
row:  6960
row:  6961
row:  6962
row:  6963
row:  6964
row:  6965
row:  6966
row:  6967
Invalid URL/ Timeout
row:  6968
row:  6969
row:  6970
row:  6971
row:  6972
row:  6973
row:  6974
row:  6975
row:  6976
row:  6

ERROR:urllib3.connection:Certificate did not match expected hostname: voyage-prive.it. Certificate: {'subject': ((('serialNumber', '479 345 043 00034'),), (('jurisdictionCountryName', 'FR'),), (('jurisdictionStateOrProvinceName', 'PACA'),), (('jurisdictionLocalityName', 'Aix en Provence'),), (('businessCategory', 'Private Organization'),), (('countryName', 'FR'),), (('postalCode', '13100'),), (('stateOrProvinceName', 'PACA'),), (('localityName', 'Aix en Provence'),), (('streetAddress', '684 Avenue du Club Hippique'),), (('organizationName', 'VOYAGE PRIVE (VPG)'),), (('organizationalUnitName', 'COMODO EV Multi-Domain SSL'),), (('commonName', 'www.voyage-prive.com'),)), 'issuer': ((('countryName', 'GB'),), (('stateOrProvinceName', 'Greater Manchester'),), (('localityName', 'Salford'),), (('organizationName', 'COMODO CA Limited'),), (('commonName', 'COMODO RSA Extended Validation Secure Server CA'),)), 'version': 3, 'serialNumber': '627C0DE0401FEF7A51A3986EA5230260', 'notBefore': 'Jun  6 

Invalid URL/ Timeout
row:  7004
row:  7005
row:  7006
row:  7007
Invalid Response
row:  7008
row:  7009
row:  7010
row:  7011
row:  7012
row:  7013
row:  7014
Invalid Response
row:  7015
row:  7016
row:  7017
row:  7018
row:  7019
row:  7020
row:  7021
row:  7022
Invalid URL/ Timeout
row:  7023
row:  7024
row:  7025
row:  7026
Invalid Response
row:  7027
row:  7028
row:  7029
row:  7030
row:  7031
row:  7032
row:  7033
row:  7034
row:  7035
row:  7036
row:  7037
row:  7038
row:  7039
row:  7040
Invalid URL/ Timeout
row:  7041
Invalid Response
row:  7042
Invalid Response
row:  7043
row:  7044
row:  7045
row:  7046
row:  7047
row:  7048
row:  7049
row:  7050
row:  7051
row:  7052
row:  7053
row:  7054
row:  7055
row:  7056
row:  7057
row:  7058
row:  7059
row:  7060
row:  7061
row:  7062
row:  7063
row:  7064
row:  7065
row:  7066
row:  7067
row:  7068
row:  7069
row:  7070
row:  7071
row:  7072
row:  7073
Invalid URL/ Timeout
row:  7074
Invalid URL/ Timeout
row:  7075
Invalid URL/ Timeo

ERROR:urllib3.connection:Certificate did not match expected hostname: hyundaiusa.com. Certificate: {'subject': ((('businessCategory', 'Private Organization'),), (('jurisdictionCountryName', 'US'),), (('jurisdictionStateOrProvinceName', 'California'),), (('serialNumber', 'C1274114'),), (('countryName', 'US'),), (('stateOrProvinceName', 'California'),), (('localityName', 'Fountain Valley'),), (('organizationName', 'Hyundai Motor America'),), (('organizationalUnitName', 'IT'),), (('commonName', 'www.hyundaiusa.com'),)), 'issuer': ((('countryName', 'US'),), (('organizationName', 'DigiCert Inc'),), (('organizationalUnitName', 'www.digicert.com'),), (('commonName', 'DigiCert SHA2 Extended Validation Server CA'),)), 'version': 3, 'serialNumber': '0B6342BBED6A2AB59AF2B30634EED4FF', 'notBefore': 'Jun 28 00:00:00 2018 GMT', 'notAfter': 'Jul  5 12:00:00 2019 GMT', 'subjectAltName': (('DNS', 'www.hyundaiusa.com'),), 'OCSP': ('http://ocsp.digicert.com',), 'caIssuers': ('http://cacerts.digicert.com/

Invalid URL/ Timeout
row:  7253
row:  7254
Invalid Response
row:  7255
Invalid URL/ Timeout
row:  7256
row:  7257
row:  7258
row:  7259
row:  7260
row:  7261
row:  7262
row:  7263
row:  7264
row:  7265
row:  7266
Invalid URL/ Timeout
row:  7267
row:  7268
Invalid URL/ Timeout
row:  7269
row:  7270
row:  7271
row:  7272
row:  7273
row:  7274
row:  7275
row:  7276
row:  7277
row:  7278
row:  7279
Invalid URL/ Timeout
row:  7280
row:  7281
row:  7282
row:  7283
row:  7284
row:  7285
row:  7286
row:  7287
row:  7288
row:  7289
row:  7290
row:  7291
row:  7292
row:  7293
row:  7294
row:  7295
row:  7296
row:  7297
row:  7298
row:  7299
Time taken for 100 rows: 451.48802638053894
row:  7300
row:  7301
row:  7302
row:  7303
row:  7304
row:  7305
Invalid Response
row:  7306
row:  7307
Invalid URL/ Timeout
row:  7308
row:  7309
row:  7310
row:  7311
row:  7312


In [5]:
# data.to_csv('data_with_lang_labels.csv', index=False)
data = pd.read_csv('data_with_lang_labels.csv')
data.iloc[7300:]

Unnamed: 0,business_name,google1_correct_website,business_category,site-content,site-language
7300,Prada Fashion,http://prada.com/,Retail,PRADA - OFFICIAL WEBSITE _.Resort 2020 Womensw...,en
7301,Roland Schuhe - Germany,http://roland-schuhe.de/,Retail,"ROLAND Onlineshop – Schuhe, Taschen und Access...",de
7302,Samsung - United Kingdom,http://samsung.com/uk/,Retail,Samsung UK | Mobile | Home Electronics | Home ...,en
7303,SBB - Germany,http://sbb.ch/,Travel,Alles für Ihre Mobilität | SBB Navigieren auf ...,de
7304,Seventh Generation,http://seventhgeneration.com/,CPG,Homepage | Seventh Generation Skip to main con...,en
7305,Soleil Sucr? - France,http://soleilsucre.com/,Retail,,
7306,Sueddeutsche Zeitung GmbH - Germany,http://sueddeutsche.de/,Other,"Nachrichten aus Politik, Kultur, Wirtschaft un...",de
7307,TBG Thermenzentrum Geinberg BetriebsgmbH - Aus...,http://therme-geinberg.at/de/,Travel,,
7308,Tons of Tiles - United Kingdom,http://tonsoftiles.co.uk/,Retail,Bathroom & Kitchen Tiles | Wall & Floor Decor ...,en
7309,ufxmarkets - France,http://ufxmarkets.com/,Financial Services,UFX.com - Where Trading Makes Sense. #1 CFD On...,en


In [6]:
data_sub = data.dropna()

In [7]:
data_sub.shape

(5681, 5)

### Language distribution

In [15]:
data_sub['site-language'].value_counts()

en       4144
de        842
es        193
fr        172
it        116
pt         67
pl         31
tr         16
ro         10
nl         10
hu          9
ca          9
ja          8
tl          8
fi          5
da          5
ru          4
ko          4
cy          3
mr          3
cs          3
id          3
no          2
hr          2
hi          2
zh-cn       2
sk          2
lt          1
vi          1
et          1
th          1
sv          1
af          1
Name: site-language, dtype: int64

## Modeling on English Data

In [9]:
website_stopwords = ["skip", "about", "menu", "contact", "cookies", "login"]

In [9]:
data_en = data_sub[data_sub['site-language']=='en']

In [21]:
data_en["site-content"] = data_en["site-content"].apply(lambda x: re.sub('[^a-zA-Z ]', " ", x))

A value is trying to be set on a copy of a slice from a DataFrame.
Try using .loc[row_indexer,col_indexer] = value instead

See the caveats in the documentation: http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
  """Entry point for launching an IPython kernel.


In [6]:
# data_en.to_csv('data_english_sites.csv',index=False)

data_en = pd.read_csv('data_english_sites.csv')

In [7]:
data_en.head()

Unnamed: 0,business_name,google1_correct_website,business_category,site-content,site-language
0,Central 1,http://central1.com/,Financial Services,Search for Join our team we re growing VIEW...,en
1,ad tech | iMedia,http://ad-tech.com/,Other,Skip to main content ad tech Australia Sydney ...,en
2,Adotmob,http://adotmob.com/,Other,MENU MENU Home Know how Data Technologies Solu...,en
3,BBC Advertising,http://advertising.bbcworldwide.com/,Other,Toggle navigation SPECS SPECS CONTACT SPECS CO...,en
4,airG,http://airg.com/,Other,Have we met before WELCOME TO AIRG Login Sign...,en


In [10]:
# nltk.download('stopwords')
stop_words = nltk.corpus.stopwords.words('english')
stop_words.extend(website_stopwords)

In [11]:
tfidf = TfidfVectorizer(stop_words=stop_words)
tfidf_vector = tfidf.fit_transform(data_en["site-content"])

In [12]:
with open('tfidf_en.pkl', 'wb') as f:
    pickle.dump(tfidf, f)

In [14]:
with open('tfidf_en.pkl', 'rb') as f:
    tfidf = pickle.load(f)



In [18]:
# tfidf_vector = tfidf.transform(data_en["site-content"])

In [19]:
print(sorted(tfidf.vocabulary_.keys()))



In [13]:
X_tfidf = tfidf_vector.toarray()

In [14]:
X_tfidf.shape

(4144, 75271)

In [15]:
le = LabelEncoder()
y_tfidf = le.fit_transform(data_en['business_category'])

In [103]:
data_en['business_category'].value_counts()

Other                 2206
Retail                 830
Financial Services     368
CPG                    299
Travel                 292
Auto                   149
Name: business_category, dtype: int64

In [102]:
pd.Series(y_tfidf).value_counts()

3    2206
4     830
2     368
1     299
5     292
0     149
dtype: int64

In [33]:
data_en['business_category'].value_counts()/data_en.shape[0]

Other                 0.532336
Retail                0.200290
Financial Services    0.088803
CPG                   0.072153
Travel                0.070463
Auto                  0.035956
Name: business_category, dtype: float64

In [16]:
X_train, X_test, y_train, y_test = train_test_split(X_tfidf, y_tfidf, test_size=0.3, random_state=42)

In [17]:
X_train.shape, X_test.shape

((2900, 75271), (1244, 75271))

### Handling Imbalanced Class

In [18]:
from imblearn.over_sampling import RandomOverSampler
ros = RandomOverSampler(random_state=42)

In [19]:
X_ros_train, y_ros_train = ros.fit_sample(X_train, y_train)

### Logistic Regression

In [20]:
lr = LogisticRegression(random_state=42)
lr.fit(X_ros_train, y_ros_train)



LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                   intercept_scaling=1, l1_ratio=None, max_iter=100,
                   multi_class='warn', n_jobs=None, penalty='l2',
                   random_state=42, solver='warn', tol=0.0001, verbose=0,
                   warm_start=False)

In [71]:
lr.score(X_train, y_train)

0.93

In [21]:
lr.score(X_test, y_test)

0.8014469453376206

In [22]:
with open('model_lr.pkl', 'wb') as f:
    pickle.dump(lr, f)

### Naive Bayes 

In [23]:
from sklearn.naive_bayes import MultinomialNB

In [24]:
nb = MultinomialNB()

In [25]:
nb.fit(X_ros_train, y_ros_train)

MultinomialNB(alpha=1.0, class_prior=None, fit_prior=True)

In [43]:
nb.score(X_train, y_train)

0.8644827586206897

In [26]:
nb.score(X_test, y_test)

0.7467845659163987

In [27]:
with open('model_naiveb.pkl', 'wb') as f:
    pickle.dump(nb, f)

### Linear SVM

In [113]:
from sklearn.svm import SVC

In [114]:
svc = SVC(kernel='linear', random_state=42)


In [115]:
svc.fit(X_ros_train, y_ros_train)

SVC(C=1.0, cache_size=200, class_weight=None, coef0=0.0,
    decision_function_shape='ovr', degree=3, gamma='auto_deprecated',
    kernel='linear', max_iter=-1, probability=False, random_state=42,
    shrinking=True, tol=0.001, verbose=False)

In [116]:
svc.score(X_test, y_test)

0.772508038585209

### Random Forest

In [28]:
from sklearn.ensemble import RandomForestClassifier

In [67]:
rf = RandomForestClassifier(n_estimators=30, max_depth=30, min_samples_leaf=10)

In [68]:
rf.fit(X_ros_train, y_ros_train)

RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
                       max_depth=30, max_features='auto', max_leaf_nodes=None,
                       min_impurity_decrease=0.0, min_impurity_split=None,
                       min_samples_leaf=10, min_samples_split=2,
                       min_weight_fraction_leaf=0.0, n_estimators=30,
                       n_jobs=None, oob_score=False, random_state=None,
                       verbose=0, warm_start=False)

In [69]:
rf.score(X_train, y_train)

0.8548275862068966

In [70]:
rf.score(X_test, y_test)

0.7508038585209004

In [72]:
with open('model_rf.pkl', 'wb') as f:
    pickle.dump(rf, f)

### VotingClassifier

In [73]:
from sklearn.ensemble import VotingClassifier

In [78]:
vc = VotingClassifier(estimators = [('lr',lr), ('nb',nb), ('rf',rf)])

In [79]:
vc.fit(X_ros_train, y_ros_train)



VotingClassifier(estimators=[('lr',
                              LogisticRegression(C=1.0, class_weight=None,
                                                 dual=False, fit_intercept=True,
                                                 intercept_scaling=1,
                                                 l1_ratio=None, max_iter=100,
                                                 multi_class='warn',
                                                 n_jobs=None, penalty='l2',
                                                 random_state=42, solver='warn',
                                                 tol=0.0001, verbose=0,
                                                 warm_start=False)),
                             ('nb',
                              MultinomialNB(alpha=1.0, class_prior=None,
                                            fit_prior=True)),
                             ('rf',
                              RandomFores...
                                         

In [80]:
vc.score(X_train,y_train)

0.9055172413793103

In [81]:
vc.score(X_test, y_test)

0.7877813504823151

In [84]:
from sklearn.metrics import classification_report
print(classification_report(y_test, vc.predict(X_test)))

              precision    recall  f1-score   support

           0       0.65      0.76      0.70        51
           1       0.59      0.74      0.66        95
           2       0.69      0.74      0.71        96
           3       0.84      0.86      0.85       673
           4       0.79      0.62      0.69       250
           5       0.82      0.81      0.82        79

    accuracy                           0.79      1244
   macro avg       0.73      0.76      0.74      1244
weighted avg       0.79      0.79      0.79      1244



In [85]:
with open('model_ensemble.pkl', 'wb') as f:
    pickle.dump(vc, f)

In [109]:
text = "Hello world. How do you do ? It's a pleasure to meet you ! We sell cars and bikes to our esteemed customers"

In [110]:
new_vector = tfidf.transform([text])

In [111]:
X_new = new_vector.toarray()

In [112]:
vc.predict(X_new)

array([0])

In [104]:
class_mapping = {0:"Auto", 1:"CPG", 2:"Financial Services", 3:"Other", 4:"Retail", 5:"Travel"}

## Lemmatized columns