In [1]:
import string
import pandas as pd
import nltk
from nltk.tokenize import word_tokenize
from nltk.probability import FreqDist

In [2]:
pd.set_option('display.max_rows', 200)

In [3]:
data = pd.read_csv('../data/descriptions.csv')

In [4]:
data.head()

Unnamed: 0.1,Unnamed: 0,Title,Description
0,0,IPhone 6+ 16GB gold TAKUU 12kk,Myydään erinomaisessa kunnossa oleva iPhone 6 ...
1,1,IPhone XS 64GB,"Hyväkuntoinen iPhone, laturi, kuulokkeet, suoj..."
2,2,Iphone 6s,"Akun kunto 88%, muistia muistaakseni 16. Panss..."
3,3,IPhone 8+ 64GB gold TAKUU 12kk,Myydään siistissä kunnossa oleva gold iPhone 8...
4,4,Apple iphone 8+ 64 gt space grey,Myydään kyseinen puhelin pois keräämästä pölyä...


In [5]:
data['Sentence'] = data['Title'] # + ' ' + data['Description']

In [6]:
data = data.drop(['Unnamed: 0', 'Title', 'Description'], axis=1)

In [7]:
data.head()

Unnamed: 0,Sentence
0,IPhone 6+ 16GB gold TAKUU 12kk
1,IPhone XS 64GB
2,Iphone 6s
3,IPhone 8+ 64GB gold TAKUU 12kk
4,Apple iphone 8+ 64 gt space grey


In [8]:
corpus = word_tokenize(' '.join(data.loc[:, 'Sentence']).lower())

In [9]:
len(corpus)

10948

In [10]:
corpus[:15]

['iphone',
 '6+',
 '16gb',
 'gold',
 'takuu',
 '12kk',
 'iphone',
 'xs',
 '64gb',
 'iphone',
 '6s',
 'iphone',
 '8+',
 '64gb',
 'gold']

Remove stopwords

In [11]:
sw = nltk.corpus.stopwords.words('finnish')

In [12]:
len(sw)

235

In [13]:
corpus = [t for t in corpus if t not in sw]

Remove punctuation

In [14]:
corpus = [t for t in corpus if t not in string.punctuation]

In [15]:
len(corpus)

10129

In [16]:
fd = FreqDist(corpus)

In [17]:
fd.most_common(10)

[('iphone', 1097),
 ('samsung', 360),
 ('64gb', 318),
 ('nokia', 251),
 ('7', 239),
 ('galaxy', 225),
 ('8', 210),
 ('huawei', 178),
 ('pro', 170),
 ('32gb', 153)]

In [18]:
common_uni = pd.DataFrame({
    'Word': [w[0] for w in fd.most_common(200)],
    'Count': [w[1] for w in fd.most_common(200)],
})

In [19]:
common_uni

Unnamed: 0,Word,Count
0,iphone,1097
1,samsung,360
2,64gb,318
3,nokia,251
4,7,239
5,galaxy,225
6,8,210
7,huawei,178
8,pro,170
9,32gb,153


In [20]:
from nltk import bigrams, trigrams

In [21]:
bi_fd = FreqDist(list(bigrams(corpus)))

In [22]:
# Total count 4404
common_bi = pd.DataFrame({
    'Word': [' '.join(w[0]) for w in bi_fd.most_common(200)],
    'Count': [w[1] for w in bi_fd.most_common(200)]}
)

In [23]:
common_bi

Unnamed: 0,Word,Count
0,samsung galaxy,200
1,iphone 7,183
2,iphone 8,148
3,iphone 6s,117
4,takuu 12kk,107
5,iphone xs,98
6,apple iphone,94
7,iphone x,91
8,iphone 6,65
9,iphone 11,62


In [24]:
tri_fd = FreqDist(list(trigrams(corpus)))

In [25]:
# Total count 7192
common_tri = pd.DataFrame({
    'Word': [' '.join(w[0]) for w in tri_fd.most_common(200)],
    'Count': [w[1] for w in tri_fd.most_common(200)],
})

In [26]:
common_tri

Unnamed: 0,Word,Count
0,iphone 8 plus,54
1,iphone 7 32gb,44
2,iphone 8 64gb,42
3,iphone x 64gb,41
4,takuu 12kk iphone,40
5,iphone 7 plus,40
6,iphone xs max,39
7,iphone 11 pro,39
8,8 plus 64gb,31
9,iphone 6s 32gb,30


In [27]:
tags = pd.concat([common_uni, common_bi, common_tri])

In [28]:
tags

Unnamed: 0,Word,Count
0,iphone,1097
1,samsung,360
2,64gb,318
3,nokia,251
4,7,239
...,...,...
195,max 64gb gold,5
196,32gb space gray,5
197,iphone 6 64gb,5
198,10 lite iphone,5


In [29]:
tags.to_csv('../data/tags.csv')