## Dataset Cleaning: Company Slogans
----

In this project, I clean a dataset consisting of various company slogans so that it may be used for other purposes. 

In [1]:
## import packages
import pandas as pd 
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize

In [2]:
## dataset url = https://github.com/ChaiBapchya/slogans/blob/master/sloganlist.csv

In [3]:
## initialize dataset
df_slogans = pd.read_csv('slogans.csv')

In [4]:
## get shape
df_slogans.shape

(1162, 2)

In [5]:
## get info
df_slogans.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1162 entries, 0 to 1161
Data columns (total 2 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   Company  1162 non-null   object
 1   Slogan   1162 non-null   object
dtypes: object(2)
memory usage: 18.3+ KB


In [6]:
## get first five rows
df_slogans.head()

Unnamed: 0,Company,Slogan
0,Costa Coffee,For coffee lovers.
1,Evian,Evian. Live young.
2,Dasani,Designed to make a difference.
3,Heineken,It's all about the beer.
4,Gatorade,The Legend Continues.


In [7]:
## check for missing values
df_slogans.isna().sum()

Company    0
Slogan     0
dtype: int64

In [8]:
## ascii characters 
df_slogans['Slogan'] = df_slogans['Slogan'].str.encode('ascii', 'ignore').str.decode('ascii')

In [9]:
## convert to lowercase and remove lead space
df_slogans['Slogan'] = df_slogans['Slogan'].str.lower()
df_slogans['Slogan'] = df_slogans['Slogan'].str.strip()

df_slogans['Company'] = df_slogans['Company'].str.lower()
df_slogans['Company'] = df_slogans['Company'].str.strip()

In [10]:
def no_punctuation(slogan):
    try:
        slogan = slogan.str.replace('[^\w\s]','')
    except:
        pass
    return slogan

df_slogans.apply(no_punctuation)

Unnamed: 0,Company,Slogan
0,costa coffee,for coffee lovers
1,evian,evian live young
2,dasani,designed to make a difference
3,heineken,its all about the beer
4,gatorade,the legend continues
...,...,...
1157,levis jeans,original jeans original people
1158,dove toiletries,the secret of beautiful hair
1159,lipton,be more tea
1160,pampers,discover your babys world


In [11]:
## remove stopwords
stop = stopwords.words('english')

df_slogans['Slogan'] = df_slogans['Slogan'].apply(
    lambda x: ' '.join([word for word in x.split() if word not in (stop)]))

In [12]:
## tokenize the data
df_slogans['Slogan'] = df_slogans['Slogan'].apply(word_tokenize)

df_slogans['Slogan']

0                             [coffee, lovers, .]
1                      [evian, ., live, young, .]
2                 [designed, make, difference, .]
3                                       [beer, .]
4                          [legend, continues, .]
                          ...                    
1157    [original, jeans, ., original, people, .]
1158                 [secret, beautiful, hair, .]
1159                                     [tea, .]
1160                  [discover, babys, world, .]
1161                          [taste, feeling, .]
Name: Slogan, Length: 1162, dtype: object