In [88]:
# !pip install pandasql

import numpy as np
import pandas as pd
import pandasql as ps

import warnings
warnings.filterwarnings('ignore')

Схема БД состоит из четырех таблиц:

**Product(maker, model, type)**

**PC(code, model, speed, ram, hd, cd, price)**

**Laptop(code, model, speed, ram, hd, price, screen)**

**Printer(code, model, color, type, price)**

Таблица Product представляет производителя (maker), номер модели (model) и тип ('PC' - ПК, 'Laptop' -  ноутбук или 'Printer' - принтер). Предполагается, что номера моделей в таблице Product уникальны для всех производителей и типов продуктов. В таблице PC для каждого ПК, однозначно определяемого уникальным кодом – code, указаны модель – model (внешний ключ к таблице Product), скорость - speed (процессора в мегагерцах), объем памяти - ram (в мегабайтах), размер диска - hd (в гигабайтах), скорость считывающего устройства - cd (например, '4x') и цена - price. Таблица Laptop аналогична таблице РС за исключением того, что вместо скорости CD содержит размер экрана -screen (в дюймах). В таблице Printer для каждой модели принтера указывается, является ли он цветным - color ('y', если цветной), тип принтера - type (лазерный – 'Laser', струйный – 'Jet' или матричный – 'Matrix') и цена - price.
 
 ![](computers.png)

In [248]:
product = pd.read_csv('data/product.csv', index_col='model')
product.head()

Unnamed: 0_level_0,maker,type
model,Unnamed: 1_level_1,Unnamed: 2_level_1
1121,B,PC
1232,A,PC
1233,A,PC
1260,E,PC
1276,A,Printer


In [244]:
laptop = pd.read_csv('data/laptop.csv', index_col='code')
laptop.head()

Unnamed: 0_level_0,model,speed,ram,hd,price,screen
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1298,350,32,4.0,700,11
2,1321,500,64,8.0,970,12
3,1750,750,128,12.0,1200,14
4,1298,600,64,10.0,1050,15
5,1752,750,128,10.0,1150,14


In [245]:
pc = pd.read_csv('data/pc.csv', index_col='code')
pc.head()

Unnamed: 0_level_0,model,speed,ram,hd,cd,price
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1
1,1232,500,64,5.0,12x,600
2,1121,750,128,14.0,40x,850
3,1233,500,64,5.0,12x,600
4,1121,600,128,14.0,40x,850
5,1121,600,128,8.0,40x,850


In [246]:
printer = pd.read_csv('data/printer.csv', index_col='code')
printer.head()

Unnamed: 0_level_0,model,color,type,price
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1276,n,Laser,400
2,1433,y,Jet,270
3,1434,y,Jet,290
4,1401,n,Matrix,150
5,1408,n,Matrix,270


### Task 1
Для каждого производителя, выпускающего ноутбуки c объёмом жесткого диска не менее 10 Гбайт, найти скорости таких ноутбуков. Вывод: производитель, скорость. 

 ![](computers.png)

### SQL

In [249]:
query = '''
    select
        product.maker, laptop.speed
    from product join laptop 
    on product.model = laptop.model and laptop.hd >= 10
    '''
ps.sqldf(query)

Unnamed: 0,maker,speed
0,B,750
1,A,600
2,A,750
3,A,450


### Pandas

In [94]:
product_laptop = product.merge(laptop, left_index=True, right_on='model')
product_laptop

Unnamed: 0_level_0,maker,type,model,speed,ram,hd,price,screen
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
1,A,Laptop,1298,350,32,4.0,700,11
4,A,Laptop,1298,600,64,10.0,1050,15
6,A,Laptop,1298,450,64,10.0,950,12
2,C,Laptop,1321,500,64,8.0,970,12
3,B,Laptop,1750,750,128,12.0,1200,14
5,A,Laptop,1752,750,128,10.0,1150,14


In [95]:
product_laptop[product_laptop['hd'] >= 10][['maker', 'speed']]

Unnamed: 0_level_0,maker,speed
code,Unnamed: 1_level_1,Unnamed: 2_level_1
4,A,600
6,A,450
3,B,750
5,A,750


### Task 2

Найдите производителей, выпускающих по меньшей мере три различных модели ПК. Вывести: Maker, число моделей ПК.

 ![](computers.png)

### SQL

In [96]:
query = '''
    select
        maker,
        count(model) as count
    from product
    where type = 'PC'
    group by maker
    having count >= 3
    '''
data = ps.sqldf(query)
data.set_index('maker')

Unnamed: 0_level_0,count
maker,Unnamed: 1_level_1
E,3


### Pandas

In [97]:
maker = product[product['type'] == 'PC'].groupby('maker').count()
maker = maker.rename(columns = {'type':'count'})
maker

Unnamed: 0_level_0,count
maker,Unnamed: 1_level_1
A,2
B,1
E,3


In [98]:
maker[maker['count'] >= 3]

Unnamed: 0_level_0,count
maker,Unnamed: 1_level_1
E,3


### Task 3
Найдите производителя (единственного) самых дешевых цветных принтеров. Вывести: maker, price 

 ![](computers.png)

### SQL

In [257]:
query = '''
    select
        maker,
        min(price)
    from product join printer
    on product.model = printer.model and printer.color = 'y'
'''
ps.sqldf(query)

Unnamed: 0,maker,min(price)
0,D,270


In [271]:
query = '''
    select
        maker,
        price
    from product join printer
    on product.model = printer.model and printer.color = 'y'
        and price = (select min(price) from printer where color = 'y')
'''
ps.sqldf(query)

Unnamed: 0,maker,price
0,D,270


### Pandas

In [138]:
printer[printer['color'] == 'y'][['model', 'price']]

Unnamed: 0_level_0,model,price
code,Unnamed: 1_level_1,Unnamed: 2_level_1
2,1433,270
3,1434,290


In [258]:
product[product['type']=='Printer'][['maker']]

Unnamed: 0_level_0,maker
model,Unnamed: 1_level_1
1276,A
1288,D
1401,A
1408,A
1433,D
1434,E


In [259]:
data = product[product['type']=='Printer'][['maker']].merge(
    printer[printer['color']=='y'][['model', 'price']], left_index=True, right_on='model')
data

Unnamed: 0_level_0,maker,model,price
code,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
2,D,1433,270
3,E,1434,290


In [266]:
data[data['price'] == data['price'].min()][['maker', 'price']]

Unnamed: 0_level_0,maker,price
code,Unnamed: 1_level_1,Unnamed: 2_level_1
2,D,270


### Task 4
Найдите модели ноутбуков, скорость которых меньше скорости любого из ПК. Вывести: model, speed.

 ![](computers.png)

### SQL

In [285]:
query = '''
    select
        model,
        speed
    from laptop
    where speed < (select min(pc.speed) from pc)
'''
ps.sqldf(query)

Unnamed: 0,model,speed
0,1298,350


### Pandas

In [286]:
laptop[laptop['speed'] < pc['speed'].min()][['model', 'speed']]

Unnamed: 0_level_0,model,speed
code,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1298,350


### Task 5

Найдите пары моделей PC, имеющих одинаковые скорость и RAM. В результате каждая пара указывается только один раз, т.е. (i,j), но не (j,i), Порядок вывода: модель с большим номером, модель с меньшим номером, скорость и RAM.

 ![](computers.png)

### SQL

In [288]:
query = '''
    select
        a.model,
        b.model,
        a.speed,
        a.ram
    from pc as a join pc as b
    on a.speed = b.speed and a.ram = b.ram and a.model < b.model
'''
ps.sqldf(query)

Unnamed: 0,model,model.1,speed,ram
0,1232,1233,500,64
1,1121,1233,750,128
2,1232,1260,500,32


### Pandas

In [206]:
data = pc.merge(pc, on=['speed', 'ram'])
data.iloc[:4]

Unnamed: 0,model_x,speed,ram,hd_x,cd_x,price_x,model_y,hd_y,cd_y,price_y
0,1232,500,64,5.0,12x,600,1232,5.0,12x,600
1,1232,500,64,5.0,12x,600,1233,5.0,12x,600
2,1233,500,64,5.0,12x,600,1232,5.0,12x,600
3,1233,500,64,5.0,12x,600,1233,5.0,12x,600


In [207]:
data[(data['model_x'] > data['model_y'])][['model_x', 'model_y', 'speed', 'ram']]

Unnamed: 0,model_x,model_y,speed,ram
2,1233,1232,500,64
6,1233,1121,750,128
14,1260,1232,500,32


### Task 6

Найти производителей, которые выпускают более одной модели, при этом все выпускаемые производителем модели являются продуктами одного типа.
Вывести: maker, type 

 ![](computers.png)

### SQL

In [20]:
query = '''
    select maker, type
    from product
    group by maker
    having count(distinct type) = 1 and count(model) > 1
'''
ps.sqldf(query)

Unnamed: 0,maker,type
0,D,Printer


### Pandas

In [305]:
more_one = product.groupby('maker').count() > 1
more_one

Unnamed: 0_level_0,type
maker,Unnamed: 1_level_1
A,True
B,True
C,False
D,True
E,True


In [306]:
unique = product.groupby('maker').nunique()[['type']] == 1
unique

Unnamed: 0_level_0,type
maker,Unnamed: 1_level_1
A,False
B,False
C,True
D,True
E,False


In [310]:
x = unique & more_one
x

Unnamed: 0_level_0,type
maker,Unnamed: 1_level_1
A,False
B,False
C,False
D,True
E,False


In [23]:
idxs = product[['maker']].merge(
    more_one & unique, right_index=True, left_on='maker')['type']

In [24]:
product.loc[idxs].drop_duplicates()

Unnamed: 0_level_0,maker,type
model,Unnamed: 1_level_1,Unnamed: 2_level_1
1288,D,Printer


### Task 7

Задача по предсказанию, сколько потратит клиент в свой первый визит на следующей неделе.

### Pandas

In [27]:
# load data
train = pd.read_csv('data/market.csv')
train[4:8]

Unnamed: 0,id,date,sum
4,1,15,8
5,1,17,6
6,1,23,2
7,1,24,7


In [28]:
# weights by date for weighted mode 
train['weights'] = (train.date / train.date.max()) ** 1.2
train[4:8]

Unnamed: 0,id,date,sum,weights
4,1,15,8,0.01744
5,1,17,6,0.020266
6,1,23,2,0.029128
7,1,24,7,0.030654


In [29]:
byw = train.groupby(['id', 'sum'])[['weights']].sum()
byw.loc[1:1]

Unnamed: 0_level_0,Unnamed: 1_level_0,weights
id,sum,Unnamed: 2_level_1
1,1,26.984579
1,2,14.208423
1,3,5.630956
1,4,5.456149
1,5,11.246348
1,6,15.971761
1,7,18.31788
1,8,5.361063


In [30]:
ind = byw.reset_index().groupby('id')['weights'].apply(np.argmax)
ind[:4]

id
1     0
2    11
3    16
4    24
Name: weights, dtype: int64

In [31]:
prediction = byw.iloc[ind]
prediction[:4]

Unnamed: 0_level_0,Unnamed: 1_level_0,weights
id,sum,Unnamed: 2_level_1
1,1,26.984579
2,4,19.809903
3,5,10.248162
4,6,13.086052


In [32]:
prediction.reset_index('sum', inplace=True)
prediction[:4]

Unnamed: 0_level_0,sum,weights
id,Unnamed: 1_level_1,Unnamed: 2_level_1
1,1,26.984579
2,4,19.809903
3,5,10.248162
4,6,13.086052


In [33]:
# calculate last day of visit
train['delta_day'] = train['date'].diff()
train[:4]

Unnamed: 0,id,date,sum,weights,delta_day
0,1,2,6,0.001554,
1,1,3,7,0.002528,1.0
2,1,8,5,0.008202,5.0
3,1,10,7,0.010721,2.0


In [34]:
# calculate mean last day
prediction['delta_day'] = train.groupby('id')['delta_day'].mean()
prediction[:4]

Unnamed: 0_level_0,sum,weights,delta_day
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1
1,1,26.984579,2.164179
2,4,19.809903,-0.04065
3,5,10.248162,-1.520833
4,6,13.086052,0.905882


In [35]:
# calculate mean day between visits
prediction['last_day'] = train.groupby('id').tail(1).reset_index().set_index('id')['date']
prediction[:4]

Unnamed: 0_level_0,sum,weights,delta_day,last_day
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1,26.984579,2.164179,437
2,4,19.809903,-0.04065,432
3,5,10.248162,-1.520833,359
4,6,13.086052,0.905882,436


In [36]:
# parameters
f_delta = -4
f_minus = -7
f_plus = 7
last_day = 438

# filter: long time without visits
# filter: estimated visit before our week
# filter: estimated visit after our week
def make_zero(row):
    if ((row['last_day'] - last_day < f_minus) 
            or (row['last_day'] + row['delta_day'] - last_day < f_delta)
            or (row['last_day'] + row['delta_day'] - last_day > f_plus)):
        return 0
    else:
        return row['sum']

prediction['sum'] = prediction.apply(make_zero, axis=1)
prediction[:4]

Unnamed: 0_level_0,sum,weights,delta_day,last_day
id,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1
1,1.0,26.984579,2.164179,437
2,0.0,19.809903,-0.04065,432
3,0.0,10.248162,-1.520833,359
4,6.0,13.086052,0.905882,436


In [40]:
# saving
submit = prediction[['sum']]
submit

Unnamed: 0_level_0,sum
id,Unnamed: 1_level_1
1,1.0
2,0.0
3,0.0
4,6.0
5,2.0
6,4.0
7,1.0
8,5.0
9,3.0
10,0.0


In [41]:
submit.to_csv('data/submit.csv')