In [5]:
from faker import Faker
from datetime import date
import pandas as pd
import numpy as np

In [None]:
fake = Faker()

# First company

In [None]:
n_users = 1000

In [None]:
user_names = [fake.name() for i in range(n_users)]
user_addresses = [fake.address() for i in range(n_users)]
user_reg_dates = [fake.date_between_dates(date(2015,1,1), date(2017,10,14)) for i in range(n_users)]

In [None]:
users = pd.DataFrame({
        'name': user_names,
        'address': user_addresses,
        'registration_date': user_reg_dates
    })\
    .sort_values('registration_date')\
    .reset_index(drop=True)\
    .reset_index()\
    .rename(columns = {'index': 'id'})

In [None]:
users.loc[0, 'registration_date']

In [None]:
order_user_ids = np.random.choice(range(n_users), size = n_users * 20)

In [None]:
order_dates = [fake.date_between_dates(users.loc[oui, 'registration_date'], date(2017,10,14)) for oui in order_user_ids]

In [None]:
order_costs = [(np.random.randn() + 1) * 1000 for i in range(len(order_user_ids))]

In [None]:
order_lats = [fake.latitude() for i in range(len(order_user_ids))]
order_lons = [fake.longitude() for i in range(len(order_user_ids))]

In [None]:
orders = pd.DataFrame({
        'date': order_dates,
        'user_id': order_user_ids,
        'lat': order_lats,
        'lon': order_lons,
        'cost': order_costs
    })\
    .sort_values('date')\
    .reset_index(drop=True)\
    .reset_index()\
    .rename(columns = {'index': 'id'})

In [None]:
users.to_csv('data/0_users.csv', index=False)
orders.to_csv('data/0_orders.csv', index=False)

# Parse dates

In [None]:
from dateparser import parse

In [None]:
query = """
How many users registered two days ago
"""

In [None]:
def parse_dates(query):

    words = query.split()

    words_parsed = [0] * len(words)
    for i in range(len(words)):
        for j in range(i, len(words)):
            candidate = ' '.join(words[i:j+1])
            if parse(candidate):
                words_parsed[i:j+1] = [1]*(j+1-i)

    for i in range(len(words)):
        if words[i] in ['and', 'to']:
            words_parsed[i] = 0
                
    parsed_dates = []
    current_date = ''
    for i in range(len(words)+1):
        if i == len(words) or words_parsed[i] == 0:
            if current_date:
                parsed_dates.append(parse(current_date).date())
            current_date = ''
        else:
            current_date = current_date + (' ' if current_date else '')  + words[i]

    parsed_dates = sorted(parsed_dates)
    dates_to_return = []
    if len(parsed_dates) > 0:
        dates_to_return.append(parsed_dates[0])
    if len(parsed_dates) > 1:
        dates_to_return.append(parsed_dates[len(parsed_dates) - 1])
    
    return parsed_dates

In [None]:
q = 'How many users registered last month?'
parse_dates(q)

In [None]:
q = 'How many users registered in the last 7 days?'
parse_dates(q)

In [None]:
q = 'How many users registered from 01/09/2017 to 01/10/2017'
parse_dates(q)

# Comments

In [None]:
c_data = pd.read_csv('data_for_comments.csv', encoding='cp1251')

In [None]:
comments = c_data['content']

In [None]:
l = len(comments)

In [None]:
comments.index = np.random.randint(0, 20000, size = l)

In [None]:
comments = comments.to_frame().reset_index().rename(columns = {'index': 'id', 'content': 'comment'})

In [None]:
orders = orders.merge(comments, how = 'left', on  = 'id')

In [None]:
orders.to_csv('data/0_orders.csv')

In [None]:
import requests
def get_sentiment(text, lang = 'en'):
    """
    Grabs sentiment (1/0 - good/bad) from go microservice
    Langs: en, es
    """
    url = 'https://plentsov.com/sentiment/analyze?query={}&lang={}'.format(text, lang)
    try:
        sentiment = int(requests.get(url).json()['result'])
        sentiment = 'good' if sentiment else 'bad'
    except:
        sentiment = 'neutral'
    return sentiment

In [None]:
from tqdm import tqdm_notebook

In [None]:
tqdm_notebook().pandas()

In [None]:
pd.isnull(orders.comment[0])

In [None]:
orders['sentiment'] = orders.comment.progress_apply(lambda c: c if pd.isnull(c) else get_sentiment(c)) 

In [None]:
orders.to_csv('data/0_orders.csv')

# Sentiment prediction

In [None]:
orders = pd.read_csv('data/0_orders.csv')

In [None]:
orders = orders.loc[orders.sentiment.isin(['good', 'bad']), :].copy()

In [None]:
y = orders['sentiment'].values

In [None]:
y = (y == 'bad') * 1

In [None]:
y

In [None]:
X = orders[['cost', 'lat', 'lon', 'date', 'user_id']].copy()

In [None]:
from catboost import CatBoostClassifier

In [None]:
mdl = CatBoostClassifier()

In [None]:
cat_features = ['date', 'user_id']

In [None]:
X['cost'].dtype == np.dtype('int')

In [None]:
cat_features = [list(X.columns).index(f) for f in cat_features]

In [None]:
mdl.fit(X, y, cat_features=cat_features)

In [None]:
np.array(X.columns)[np.argsort(mdl.feature_importance_)[-3:]]

In [None]:
dict(zip(list(X.columns), [round(fi, 2) for fi in mdl.feature_importance_]))

In [None]:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from datetime import datetime

def get_ml(X, y):
	"""
	Provides with machine learning magic
	"""
	start = datetime.now()

	response = 'Using CatBoostClassifier**\n'

	response += 'Categorical features recognized:\n'
	cat_features = []
	for f in list(X.columns):
		if not X[f].dtype in [np.dtype('int'), np.dtype('float64')]:
			cat_features.append(f)
			response += ('* ' + f + '\n')
	response += '\n'

	cat_features = [list(X.columns).index(f) for f in cat_features]

	mdl.fit(X, y, cat_features=cat_features)
	response += 'Accuracy of the model: {}.\n\n'.format(mdl.score(X, y))

	response += 'Most important features:\n'
	n_features = 3
	imp_f = np.array(X.columns)[np.argsort(mdl.feature_importance_)[-n_features:]][::-1]
	imp_fi = np.round(np.sort(mdl.feature_importance_)[-n_features:],2)[::-1]
	for i in range(n_features):
		response += '* {}: {}\n'.format(imp_f[i], imp_fi[i])

	elapsed = (datetime.now() - start).seconds
	response += '\n'
	response += 'Time elapsed: {} seconds'.format(elapsed)

	return response

In [None]:
get_ml(X, y)

In [6]:
PATH_TO_SCRIPT = '/Users/plentsov/google_drive/upc2017/'
companies_data = {
	28554: {
		'users': pd.read_csv(PATH_TO_SCRIPT + 'data/0_users.csv', parse_dates = ['registration_date']),
		'orders': pd.read_csv(PATH_TO_SCRIPT + 'data/0_orders.csv', parse_dates = ['date'])
	},
	28565: {
		'users': pd.read_csv(PATH_TO_SCRIPT + 'data/0_users.csv', parse_dates = ['registration_date']),
		'orders': pd.read_csv(PATH_TO_SCRIPT + 'data/0_orders.csv', parse_dates = ['date'])
	},
	28656: {
		'users': pd.read_csv(PATH_TO_SCRIPT + 'data/0_users.csv', parse_dates = ['registration_date']),
		'orders': pd.read_csv(PATH_TO_SCRIPT + 'data/0_orders.csv', parse_dates = ['date'])
	}
}

In [7]:
orders = companies_data[28656]['orders'].copy()
data = orders.loc[orders.sentiment.isin(['good', 'bad']), :].copy()
y = data['sentiment'].values
y = (y == 'bad') * 1
data['date'] = data['date'].astype(str)
X = data[['cost', 'lat', 'lon', 'date', 'user_id']].copy()
#data = oscar_ml.get_ml(X, y)

In [8]:
import oscar_ml

In [9]:
oscar_ml.get_ml(X, y)

AttributeError: 'CatBoostClassifier' object has no attribute 'feature_importance_'

In [11]:
import numpy as np
import pandas as pd
from catboost import CatBoostClassifier
from datetime import datetime

In [14]:


"""
Provides with machine learning magic
"""
start = datetime.now()

response = 'Using CatBoostClassifier**\n'

response += 'Categorical features recognized:\n'
cat_features = []
for f in list(X.columns):
    if not X[f].dtype in [np.dtype('int'), np.dtype('float64')]:
        cat_features.append(f)
        response += ('* ' + f + '\n')
response += '\n'

cat_features = [list(X.columns).index(f) for f in cat_features]

mdl = CatBoostClassifier()
mdl.fit(np.array(X), y, cat_features=cat_features)
response += 'Accuracy of the model: {}.\n\n'.format(round((mdl.predict(np.array(X)) == y).mean(), 2))

response += 'Most important features:\n'
n_features = 3
imp_f = np.array(X.columns)[np.argsort(mdl.feature_importances_)[-n_features:]][::-1]
imp_fi = np.round(np.sort(mdl.feature_importances_)[-n_features:],2)[::-1]
for i in range(n_features):
    response += '* {}: {}\n'.format(imp_f[i], imp_fi[i])

elapsed = (datetime.now() - start).seconds
response += '\n'
response += 'Time elapsed: {} seconds'.format(elapsed)

return response

SyntaxError: 'return' outside function (<ipython-input-14-3c0b071625c4>, line 35)

In [16]:
from oscar_ml import get_ml

In [17]:
get_ml(X, y)

AttributeError: 'CatBoostClassifier' object has no attribute 'feature_importance_'