In [29]:
import os
import pickle
from os.path import join

import pandas as pd
from matplotlib import pyplot as plt
pd.set_option('display.max_rows', 1000)
pd.set_option('display.max_columns', 1000)

import nltk
from nltk.corpus import stopwords
from nltk.tokenize import RegexpTokenizer
from nltk.stem import PorterStemmer
nltk.download('stopwords')
nltk.download('averaged_perceptron_tagger')

import scipy as sp
import numpy as np
from sklearn.feature_extraction.text import CountVectorizer

import warnings
warnings.filterwarnings('ignore')

[nltk_data] Downloading package stopwords to /home/maaxap/nltk_data...
[nltk_data]   Package stopwords is already up-to-date!
[nltk_data] Downloading package averaged_perceptron_tagger to
[nltk_data]     /home/maaxap/nltk_data...
[nltk_data]   Package averaged_perceptron_tagger is already up-to-
[nltk_data]       date!


In [2]:
!ls /home/maaxap/workspace/data/menuby

 data.csv		 MenuItemFixed.txt	  Shifts.txt
'Data Description.pdf'	 OrderEntry.txt		 'Задание для дататона.docx'
 Data.zip		 OrderItems.txt
 MenuItem.csv		 RestaurantCategory.txt


In [3]:
data_root = '/home/maaxap/workspace/data/menuby'
data_filepath = join(data_root, 'data.csv')

data = pd.read_csv(data_filepath)
data = data.dropna()

In [4]:
data.columns = [
    'user_id', 'order_id', 'item_id', 
    'count', 'description'
]
data = data[[
    'user_id', 'order_id', 'item_id', 
    'description', 'count'
]]

In [5]:
data.head(10)

Unnamed: 0,user_id,order_id,item_id,description,count
0,59960,104376,16489,Classic kebab with lamb 380g lavash lamb pi...,2.0
1,59968,104377,2320,Whopper with cheese Set whopper with cheese v...,1.0
2,54779,104378,10745,Falafel 350g,1.0
3,54779,104378,12098,Vegetarian shawarma 400g cheese mushrooms t...,1.0
4,30143,104379,18732,Kagatsu maki 8pcs 116g rice nori cheese Ph...,2.0
5,30143,104379,18726,Asahi maki 8pcs 116g rice nori crab sticks...,2.0
6,30143,104379,18729,Kappa maki 8pcs 111g rice nori cucumber,1.0
7,59971,104380,14172,King Burger 460g fried bun cheese fresh veg...,1.0
8,59971,104380,14147,Bread sticks 150g deep fried bread sticks in ...,1.0
9,59972,104381,4061,Pizza Lisitsa pizza sauce ham pepperoni cha...,2.0


In [6]:
tokenizer = RegexpTokenizer('[A-Za-z]+')
stemmer = PorterStemmer()
stop_words = set(stopwords.words('english'))

def process_text(text):
    text = text.lower()
    tokens = set(tokenizer.tokenize(text)) - stop_words
    tokens = filter(lambda x: len(x) > 3, tokens)
    # tokens = map(lambda x: stemmer.stem(x), tokens)
    return ' '.join(tokens)

In [7]:
%%time

data['description'] = data['description'].apply(process_text)

CPU times: user 19.2 s, sys: 235 ms, total: 19.4 s
Wall time: 19.4 s


In [8]:
data.head()

Unnamed: 0,user_id,order_id,item_id,description,count
0,59960,104376,16489,kebab cabbage pickled lavash tomato garlic lam...,2.0
1,59968,104377,2320,whopper large cheese size drink potatoes villa...,1.0
2,54779,104378,10745,falafel,1.0
3,54779,104378,12098,vegetarian shawarma pepper bulgarian french mu...,1.0
4,30143,104379,18732,philadelphia kagatsu nori rice omelette japane...,2.0


In [9]:
data['description'] = ((data['description'] + ' ').astype(str) * 2).str.strip()

In [30]:
data.user_id = data.user_id.astype(np.int32)
data.order_id = data.order_id.astype(np.int32)
data.item_id = data.item_id.astype(np.int32)

In [37]:
data[['user_id', 'order_id', 'item_id']].to_csv(join(data_root, 'labels.csv'), index=False)

In [11]:
count_vectorizer = CountVectorizer()

In [12]:
count_vectorizer.fit(data['description'].values)

CountVectorizer(analyzer='word', binary=False, decode_error='strict',
                dtype=<class 'numpy.int64'>, encoding='utf-8', input='content',
                lowercase=True, max_df=1.0, max_features=None, min_df=1,
                ngram_range=(1, 1), preprocessor=None, stop_words=None,
                strip_accents=None, token_pattern='(?u)\\b\\w\\w+\\b',
                tokenizer=None, vocabulary=None)

In [13]:
X = count_vectorizer.transform(data['description'].values)

In [18]:
sp.sparse.save_npz(join(data_root, 'embedings.npz'), X)