In [1]:
import pandas as pd
import numpy as np
from sklearn.model_selection import RandomizedSearchCV
from sklearn.decomposition import LatentDirichletAllocation
from sklearn.feature_extraction.text import CountVectorizer
import seaborn as sns
import matplotlib.pyplot as plt
%matplotlib inline

In [2]:
data = pd.read_csv('../Data/cleaned_ads_data.csv')

In [3]:
data.head()

Unnamed: 0,brand,country,medium,headline,description,industry
0,Enable Foundation,,Ambient & Interactive,,surreal is real in dementia is an awareness ca...,Public interest & Non-profit
1,Tzabar,Israel,Print,roger waters live in paris 3 nights concert €...,,Transport & Tourism
2,Infiniti,United States,Print,accelerating the future,,Automotive & Services
3,Toyota,Canada,Print,official ride of the spinfest amateur dj awards,,Automotive & Services
4,Friends of the Earth,Germany,Print,travelling fruits cause pollution think global...,,Public interest & Non-profit


In [4]:
desc = data.dropna(subset=['description'])
len(desc)

17608

In [5]:
desc.head()

Unnamed: 0,brand,country,medium,headline,description,industry
0,Enable Foundation,,Ambient & Interactive,,surreal is real in dementia is an awareness ca...,Public interest & Non-profit
6,Fedrigoni,,Design & Branding,,pulp is a quarterly journal of people and pape...,Non-alcoholic drinks
7,Spontex,France,Cyber,,execution each time a page would post the pic...,"House, Garden & Pets"
8,Coca-Cola,Denmark,Promotion & Event,makes monday taste like friday,summary in 2006 the northern european market f...,Non-alcoholic drinks
9,DOE,United Kingdom,Print,no seatbelt no excuse,how a nineyear investment of â£868m in road sa...,Public interest & Non-profit


In [6]:
desc[desc.duplicated(subset=['description'])]

Unnamed: 0,brand,country,medium,headline,description,industry
1504,StreetEasy,United States,Outdoor,ive been hiding a saint berdoodle from my land...,find your place showcases all of the amenities...,Professional & Public services
1777,U.S. Census Bureau,United States,Print,its for our ohana its in our hands,the objective is to encourage native hawaiians...,Public interest & Non-profit
1847,New York Lottery,United States,Print,you shop the market rises yeah that kind of ri...,ddb new york has created a new typeface made o...,Toys & Games
1921,Virgin,Australia,Radio,,description synopsis this is a story about ...,Professional & Public services
2043,,,Print,find concerts in a new place,via adverbox,Media & Publishing
2074,HSL - Helsinki Region Transport,Finland,Print,throught the lens of will burrardlucas walking...,wildlife photographer will burrardlucas photog...,Public interest & Non-profit
2116,Australian Government,,Print,he stepped over it when he went into my email ...,the australian government is running the line ...,Public interest & Non-profit
2407,,Chile,Print,guarranteed satisfaction if youre not satisfie...,its a series of 10 print ads that speak about ...,Education
2451,Widow Jane,United States,Outdoor,crafted with pure mineral water from the same ...,to build a great city you start with the tough...,Alcoholic drinks
2738,City of Geneva,Switzerland,Print,throwing butts in a manhole easy getting it o...,the city of geneva is launching a campaign to ...,Public interest & Non-profit


In [7]:
desc.isnull().sum()

brand            880
country         2338
medium            72
headline       12941
description        0
industry           0
dtype: int64

In [8]:
desc.brand.value_counts()[:25]

Volkswagen                   242
Nike                         176
McDonald's                   151
Samsung                      121
IKEA                         120
Google                       113
Coca-Cola                    100
Honda                         88
adidas                        86
Amnesty International         84
Toyota                        84
Audi                          78
Heineken                      72
Mercedes-Benz                 72
Burger King                   72
Apple                         67
World Wildlife Fund (WWF)     60
Volvo                         57
Axe/Lynx                      55
Nissan                        52
BMW                           51
Renault                       51
Ford                          47
UNICEF                        46
Mars                          44
Name: brand, dtype: int64

In [9]:
desc.country.value_counts()[:25]

United States           3213
United Kingdom          1531
Brazil                   925
France                   911
Germany                  838
Canada                   654
Australia                554
Sweden                   413
India                    395
Spain                    340
South Africa             336
Netherlands              333
Japan                    283
Argentina                281
China                    273
Italy                    258
United Arab Emirates     231
New Zealand              222
Belgium                  216
Russia                   177
Colombia                 160
Singapore                144
Israel                   129
Switzerland              123
Portugal                 118
Name: country, dtype: int64

In [10]:
desc.medium.value_counts()[:25]

Print                    4445
TV & Cinema              4115
Cyber                    2542
Design & Branding        1643
Outdoor                  1429
Promotion & Event        1205
Ambient & Interactive    1000
Direct Market             736
Radio                     345
Miscellaneous              76
Name: medium, dtype: int64

In [11]:
desc.industry.nunique()

28

In [12]:
X = desc.description
cv = CountVectorizer(stop_words='english', ngram_range=(1,3), analyzer=
                    'word', max_df=0.75)
desc_cv = cv.fit_transform(X)

In [13]:
desc_cv.shape

(17608, 2487856)

In [14]:
lda = LatentDirichletAllocation()
params = {'n_components': range(5,26), 'learning_method':['batch', 'online'],
         'learning_decay':[.5,.6,.7,.8,.9,1]}
lda_rs = RandomizedSearchCV(lda, param_distributions=params, verbose=10, n_jobs=-1)
lda_rs.fit(desc_cv)

[Parallel(n_jobs=-1)]: Using backend LokyBackend with 8 concurrent workers.


Fitting 3 folds for each of 10 candidates, totalling 30 fits


[Parallel(n_jobs=-1)]: Done   2 tasks      | elapsed:  7.0min
[Parallel(n_jobs=-1)]: Done   9 tasks      | elapsed: 843.4min
[Parallel(n_jobs=-1)]: Done  19 out of  30 | elapsed: 866.9min remaining: 501.9min
[Parallel(n_jobs=-1)]: Done  23 out of  30 | elapsed: 871.6min remaining: 265.3min
[Parallel(n_jobs=-1)]: Done  27 out of  30 | elapsed: 885.8min remaining: 98.4min
[Parallel(n_jobs=-1)]: Done  30 out of  30 | elapsed: 895.0min finished


RandomizedSearchCV(cv='warn', error_score='raise-deprecating',
          estimator=LatentDirichletAllocation(batch_size=128, doc_topic_prior=None,
             evaluate_every=-1, learning_decay=0.7,
             learning_method='batch', learning_offset=10.0,
             max_doc_update_iter=100, max_iter=10, mean_change_tol=0.001,
             n_components=10, n_jobs=None, n_topics=None, perp_tol=0.1,
             random_state=None, topic_word_prior=None,
             total_samples=1000000.0, verbose=0),
          fit_params=None, iid='warn', n_iter=10, n_jobs=-1,
          param_distributions={'n_components': range(5, 26), 'learning_method': ['batch', 'online'], 'learning_decay': [0.5, 0.6, 0.7, 0.8, 0.9, 1]},
          pre_dispatch='2*n_jobs', random_state=None, refit=True,
          return_train_score='warn', scoring=None, verbose=10)

In [15]:
lda_rs.best_params_

{'n_components': 6, 'learning_method': 'online', 'learning_decay': 0.9}

In [16]:
lda_model = lda_rs.best_estimator_

In [17]:
lda_output = lda_model.fit_transform(desc_cv)

In [18]:
def show_topics(vectorizer, lda_model, n_words=10):
    keywords = np.array(vectorizer.get_feature_names())
    topic_keywords = []
    for topic_weights in lda_model.components_:
        top_keyword_locs = (-topic_weights).argsort()[:n_words]
        topic_keywords.append(keywords.take(top_keyword_locs))
    return topic_keywords

In [19]:
show_topics(cv, lda_model)

[array(['vr', 'heineken', 'plastic', 'google', 'ibm', 'als', 'fish',
        'bible', 'maps', 'god'], dtype='<U270'),
 array(['la', 'en', 'original', 'que', 'bag', 'le', 'el', 'et', 'nissan',
        'metal'], dtype='<U270'),
 array(['site', 'lego', 'bmw', 'location', 'pizza', 'dimensions',
        'veterans', 'ford', 'speed', 'war'], dtype='<U270'),
 array(['campaign', 'people', 'new', 'media', 'brand', 'world', 'social',
        'created', 'time', 'way'], dtype='<U270'),
 array(['new', 'film', 'spot', 'creative', 'director', 'game', 'team',
        'campaign', 'music', 'work'], dtype='<U270'),
 array(['paper', 'barbie', 'fuel', 'design', 'volkswagen', 'union',
        'dementia', 'amnesty', 'japanese', 'visual'], dtype='<U270')]