In [1]:
from faker import Faker
from datetime import date
import pandas as pd
import numpy as np

In [3]:
fake = Faker()

# First company

In [28]:
n_users = 1000

In [31]:
# Synthetic user attributes: one fake name, address and registration date per user.
user_names = [fake.name() for _ in range(n_users)]
user_addresses = [fake.address() for _ in range(n_users)]
# Registration dates are drawn between 2015-01-01 and 2017-10-14.
user_reg_dates = [
    fake.date_between_dates(date(2015, 1, 1), date(2017, 10, 14))
    for _ in range(n_users)
]

In [39]:
# Build the users table: order the rows by registration date, then expose the
# resulting 0-based row position as an explicit `id` column.
users = (
    pd.DataFrame({
        'name': user_names,
        'address': user_addresses,
        'registration_date': user_reg_dates,
    })
    .sort_values('registration_date')
    .reset_index(drop=True)
    .reset_index()
    .rename(columns={'index': 'id'})
)

In [46]:
users.loc[0, 'registration_date']

datetime.date(2015, 1, 1)

In [44]:
order_user_ids = np.random.choice(range(n_users), size = n_users * 20)

In [48]:
# One order date per drawn user id, between that user's registration date and
# 2017-10-14, so an order is never dated before its user registered.
order_dates = [fake.date_between_dates(users.loc[oui, 'registration_date'], date(2017,10,14)) for oui in order_user_ids]

In [50]:
# One cost per order, drawn as (standard normal sample + 1) * 1000.
# NOTE(review): this yields a negative cost whenever the sample is below -1 —
# confirm that negative order costs are acceptable for this demo data set.
order_costs = [(np.random.randn() + 1) * 1000 for i in range(len(order_user_ids))]

In [52]:
# One random latitude / longitude per order (all latitudes are drawn first).
order_lats = [fake.latitude() for _ in order_user_ids]
order_lons = [fake.longitude() for _ in order_user_ids]

In [56]:
# Build the orders table: order the rows by date, then expose the resulting
# 0-based row position as an explicit `id` column.
orders = (
    pd.DataFrame({
        'date': order_dates,
        'user_id': order_user_ids,
        'lat': order_lats,
        'lon': order_lons,
        'cost': order_costs,
    })
    .sort_values('date')
    .reset_index(drop=True)
    .reset_index()
    .rename(columns={'index': 'id'})
)

In [61]:
users.to_csv('data/0_users.csv', index=False)
orders.to_csv('data/0_orders.csv', index=False)

# Parse dates

In [100]:
from dateparser import parse

In [121]:
query = """
How many users registered two days ago
"""

In [192]:
def parse_dates(query):
    """
    Extract the date (or date range) mentioned in a free-text query.

    Every contiguous span of words is tried against ``parse`` (dateparser);
    all words of any span that parses are flagged as date words. The
    connectors 'and' / 'to' are then un-flagged so that "X to Y" produces two
    separate dates instead of one merged phrase. Each maximal run of flagged
    words is finally parsed into a single date.

    Parameters
    ----------
    query : str
        Free-text question, split on whitespace.

    Returns
    -------
    list of datetime.date
        ``[]`` when no date is found, ``[d]`` for a single date, and
        ``[earliest, latest]`` when two or more dates are found.
    """
    words = query.split()
    n_words = len(words)

    # Flag every word that takes part in at least one parseable span.
    is_date_word = [False] * n_words
    for first in range(n_words):
        for stop in range(first + 1, n_words + 1):
            if parse(' '.join(words[first:stop])):
                is_date_word[first:stop] = [True] * (stop - first)

    # Connectors separate the two ends of a range; never treat them as date words.
    for idx, word in enumerate(words):
        if word in ('and', 'to'):
            is_date_word[idx] = False

    # Parse each maximal run of flagged words. The extra iteration at
    # idx == n_words flushes a run that reaches the end of the query.
    parsed_dates = []
    run = []
    for idx in range(n_words + 1):
        if idx < n_words and is_date_word[idx]:
            run.append(words[idx])
            continue
        if run:
            parsed = parse(' '.join(run))
            # A run stitched together from overlapping spans is not guaranteed
            # to parse as a whole; skip it instead of calling .date() on None.
            if parsed is not None:
                parsed_dates.append(parsed.date())
            run = []

    parsed_dates.sort()

    # Reduce to the endpoints of the range. The original built this list but
    # then returned every parsed date, leaving the reduction as dead code.
    if len(parsed_dates) > 1:
        return [parsed_dates[0], parsed_dates[-1]]
    return parsed_dates

In [184]:
q = 'How many users registered last month?'
parse_dates(q)

last month? 2017-09-14 04:11:36.407281


[datetime.date(2017, 9, 14)]

In [185]:
q = 'How many users registered in the last 7 days?'
parse_dates(q)

7 days? 2017-10-07 04:11:48.138956


[datetime.date(2017, 10, 7)]

In [195]:
q = 'How many users registered from 01/09/2017 to 01/10/2017'
parse_dates(q)

[datetime.date(2017, 9, 1), datetime.date(2017, 10, 1)]

# Comments

In [211]:
c_data = pd.read_csv('data_for_comments.csv', encoding='cp1251')

In [214]:
comments = c_data['content']

In [216]:
l = len(comments)

In [217]:
comments.index = np.random.randint(0, 20000, size = l)

In [223]:
comments = comments.to_frame().reset_index().rename(columns = {'index': 'id', 'content': 'comment'})

In [227]:
# Attach at most one comment per order. The comment ids were drawn at random
# with np.random.randint, so several comments can share an id; merging them
# as-is would duplicate those order rows and break the uniqueness of orders.id.
orders = orders.merge(comments.drop_duplicates(subset='id'), how = 'left', on  = 'id')

In [228]:
# Write without the pandas row index, like the first export of this file;
# otherwise the index is stored as an extra unnamed column that shows up
# again when the CSV is read back.
orders.to_csv('data/0_orders.csv', index=False)

In [246]:
import requests
def get_sentiment(text, lang = 'en', timeout = 10):
    """
    Grabs sentiment (1/0 - good/bad) from go microservice
    Langs: en, es

    Parameters
    ----------
    text : str
        Text to analyse; sent as the ``query`` parameter.
    lang : str
        Language code, sent as the ``lang`` parameter.
    timeout : float
        Seconds to wait for the service before giving up.

    Returns
    -------
    str
        'good' or 'bad' from the service result, or 'neutral' when the
        request fails or the response cannot be interpreted.
    """
    url = 'https://plentsov.com/sentiment/analyze'
    try:
        # Let requests URL-encode the parameters: comments containing '&',
        # '#', '+' etc. would otherwise corrupt a hand-built query string.
        reply = requests.get(url, params={'query': text, 'lang': lang}, timeout=timeout)
        sentiment = int(reply.json()['result'])
    except Exception:
        # Best-effort by design: network errors, timeouts, non-JSON bodies and
        # missing/invalid 'result' all map to 'neutral'. Unlike a bare
        # `except:`, KeyboardInterrupt still stops a long-running apply.
        return 'neutral'
    return 'good' if sentiment else 'bad'

In [233]:
from tqdm import tqdm_notebook

In [235]:
tqdm_notebook().pandas()




In [240]:
pd.isnull(orders.comment[0])

True

In [247]:
orders['sentiment'] = orders.comment.progress_apply(lambda c: c if pd.isnull(c) else get_sentiment(c)) 

In [249]:
# Write without the pandas row index, like the first export of this file;
# otherwise the index is stored as an extra unnamed column ('Unnamed: 0')
# when the CSV is read back with pd.read_csv.
orders.to_csv('data/0_orders.csv', index=False)

# Sentiment prediction

In [4]:
orders = pd.read_csv('data/0_orders.csv')

In [5]:
orders = orders.loc[orders.sentiment.isin(['good', 'bad']), :].copy()

In [15]:
y = orders['sentiment'].values

In [16]:
y = (y == 'bad') * 1

In [17]:
y

array([1, 0, 1, ..., 1, 1, 0])

In [19]:
X = orders[['cost', 'lat', 'lon', 'date', 'user_id']].copy()

In [21]:
from catboost import CatBoostClassifier

In [22]:
mdl = CatBoostClassifier()

In [24]:
cat_features = ['date', 'user_id']

In [45]:
X['cost'].dtype == np.dtype('int')

False

In [25]:
cat_features = [list(X.columns).index(f) for f in cat_features]

In [26]:
mdl.fit(X, y, cat_features=cat_features)

<catboost.core.CatBoostClassifier at 0x11660a400>

In [53]:
np.array(X.columns)[np.argsort(mdl.feature_importance_)[-3:]]

array(['lon', 'date', 'lat'], dtype=object)

In [55]:
dict(zip(list(X.columns), [round(fi, 2) for fi in mdl.feature_importance_]))

{'cost': 18.44, 'date': 21.46, 'lat': 21.64, 'lon': 21.42, 'user_id': 17.03}

array(['lat', 'date', 'lon'], dtype=object)

In [69]:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from datetime import datetime

def get_ml(X, y):
	"""
	Fit a CatBoostClassifier on (X, y) and return a plain-text report.

	The report lists the columns treated as categorical, the accuracy of the
	fitted model and the most important features with their importances,
	followed by the elapsed time.

	Parameters
	----------
	X : pandas.DataFrame
		Feature table. Columns whose dtype is not an integer or float type
		are passed to CatBoost as categorical features.
	y : array-like
		Target labels, one per row of X.

	Returns
	-------
	str
		Multi-line report text.
	"""
	start = datetime.now()

	response = 'Using CatBoostClassifier**\n'

	response += 'Categorical features recognized:\n'
	columns = list(X.columns)
	cat_features = []
	for position, f in enumerate(columns):
		# dtype.kind covers every int/uint/float width; np.dtype('int') is
		# platform dependent, so the previous check could misclassify integer
		# columns as categorical.
		if X[f].dtype.kind not in 'iuf':
			# CatBoost is given column positions, not names.
			cat_features.append(position)
			response += '* {}\n'.format(f)
	response += '\n'

	# Use a fresh local model instead of a notebook-level `mdl`, so the
	# function works on a clean kernel and does not refit a shared object.
	mdl = CatBoostClassifier()
	mdl.fit(X, y, cat_features=cat_features)
	# Note: this is the score on the same data the model was fitted on.
	response += 'Accuracy of the model: {}.\n\n'.format(mdl.score(X, y))

	response += 'Most important features:\n'
	# Never ask for more features than X has columns (avoids an IndexError
	# in the loop below for narrow inputs).
	n_features = min(3, len(columns))
	importances = np.asarray(mdl.feature_importance_)
	# Positions of the n_features largest importances, highest first.
	top = np.argsort(importances)[len(columns) - n_features:][::-1]
	imp_f = np.array(columns)[top]
	imp_fi = np.round(importances[top], 2)
	for i in range(n_features):
		response += '* {}: {}\n'.format(imp_f[i], imp_fi[i])

	# total_seconds() also accounts for whole days, which .seconds drops.
	elapsed = int((datetime.now() - start).total_seconds())
	response += '\n'
	response += 'Time elapsed: {} seconds'.format(elapsed)

	return response

In [70]:
get_ml(X, y)

'Using CatBoostClassifier**\nCategorical features recognized:\n* date\n\nAccuracy of the model: 0.6624.\n\nMost important features:\n* lon: 21.79\n* date: 20.51\n* user_id: 20.0\n\nTime elapsed: 9 seconds'

In [71]:
# Same steps as get_ml, run at top level so every intermediate value stays
# available in the notebook namespace for inspection.
start = datetime.now()

response = 'Using CatBoostClassifier**\n'
response += 'Categorical features recognized:\n'

# Columns whose dtype is neither the default int nor float64 are treated as
# categorical and listed in the report.
cat_features = []
for f in list(X.columns):
    if X[f].dtype not in [np.dtype('int'), np.dtype('float64')]:
        cat_features.append(f)
        response += ('* ' + f + '\n')
response += '\n'

# CatBoost is given the positions of the categorical columns, not their names.
cat_features = list(map(list(X.columns).index, cat_features))

mdl = CatBoostClassifier()
mdl.fit(X, y, cat_features=cat_features)
# Score is computed on the same data the model was fitted on.
response += 'Accuracy of the model: {}.\n\n'.format(mdl.score(X, y))

response += 'Most important features:\n'
n_features = 3
# Names and rounded values of the n_features largest importances, highest first.
imp_f = np.array(X.columns)[np.argsort(mdl.feature_importance_)[-n_features:][::-1]]
imp_fi = np.round(np.sort(mdl.feature_importance_)[-n_features:][::-1], 2)
for i in range(n_features):
    response += '* {}: {}\n'.format(imp_f[i], imp_fi[i])

elapsed = (datetime.now() - start).seconds
response += '\n' + 'Time elapsed: {} seconds'.format(elapsed)