In [36]:
import os
import dask
import dask.dataframe as dd
import dask.bag as db
import json
from collections import Counter

In [8]:
# Directory holding the accounts.*.csv shards. It can be overridden through the
# ACCOUNTS_DATA_DIR environment variable so the notebook also runs on machines
# that do not have the original local path; that path stays the default.
DATA_DIR = os.environ.get(
    'ACCOUNTS_DATA_DIR',
    r'C:\Users\miair\Documents\Python\Лекции big data\archive\data',
)
# Glob pattern matching every accounts shard inside DATA_DIR.
filenames = os.path.join(DATA_DIR, 'accounts.*.csv')

In [9]:
# Explicit column types handed to the CSV reader for the three account columns.
account_dtypes = dict(id=int, names=str, amount=int)

# Lazily load every file matched by the `filenames` glob into one Dask DataFrame.
df = dd.read_csv(filenames, dtype=account_dtypes)

In [67]:
# Peek at the first rows of the accounts data to sanity-check the load.
df.head()

Unnamed: 0,id,names,amount
0,477,Alice,44
1,486,Alice,385
2,172,Yvonne,3711
3,365,Charlie,1880
4,196,Yvonne,423


In [69]:
# Show the dtype of every column of the loaded frame.
df.dtypes

id         int32
names     object
amount     int32
dtype: object

Задача 1: вывести самое часто встречающееся имя

In [22]:
# Most frequent name: count the rows of every name and take the label with the
# largest count. All rows are counted -- the task does not restrict by amount, so
# the former `amount > 0` filter (which removed rows from the tally) is dropped.
# The result is still lazy; call .compute() on it to obtain the name.
df2 = df.groupby('names').size().idxmax()

In [23]:
# Run the lazy computation built above and display the resulting name.
df2.compute()

'Charlie'

Задача 2: сделать новую колонку, которая является результатом деления amount нацело на 100, если amount > 100, и нулём, если меньше

In [13]:
# Whole hundreds contained in each amount. Floor division is used, so negative
# amounts give negative values at this point.
hundreds = df.amount // 100

# Keep only the positive counts; everything else (amounts below 100, negative
# ones included) becomes 0. `df` itself is left untouched.
df4 = df.assign(new=hundreds.where(hundreds > 0, 0))

In [11]:
# Preview the new column on the first ten rows only. Calling .compute() here would
# load the entire Dask DataFrame into memory just to display it.
df4.head(10)

Unnamed: 0,id,names,amount,new
0,477,Alice,44,0
1,486,Alice,385,3
2,172,Yvonne,3711,37
3,365,Charlie,1880,18
4,196,Yvonne,423,4
5,230,Zelda,4360,43
6,22,Edith,1897,18
7,197,Quinn,1529,15
8,467,Xavier,85,0
9,9,Victor,999,9


Эффективно создать csv-файлы, содержащие данные из файлов `accounts.*.csv`, к которым добавлен столбец с количеством сотен в положительном значении amount и 0, если значение amount отрицательное.

In [29]:
# Write one output file per partition; the `*` in the name is replaced by the
# partition number. index=False keeps the files limited to the data columns plus
# `new`: the source frame has no index column of its own, so writing the default
# index would add an extra unnamed column that the input files do not contain.
df4.to_csv('accounts_new.*.csv', index=False)

['accounts_new.0.csv',
 'accounts_new.1.csv',
 'accounts_new.2.csv',
 'accounts_new.3.csv',
 'accounts_new.4.csv',
 'accounts_new.5.csv']

Задача 3: посчитать 10 децилей колонки amount

In [5]:
%%time
dist = df.amount.quantile([0.1 * x for x in range(1,10)]).compute()
print(dist)

0.1    -156.0
0.2      34.0
0.3     141.0
0.4     287.0
0.5     495.0
0.6     826.0
0.7    1191.0
0.8    1731.0
0.9    2481.0
Name: amount, dtype: float64
Wall time: 2.71 s


Задача 4: найти пару идущих подряд чисел с наибольшей по модулю разностью. При этом важно применить rolling для перехода через партиции

In [29]:
# Show the first rows again as a reference for the consecutive-difference task.
df.head()

Unnamed: 0,id,names,amount
0,477,Alice,44
1,486,Alice,385
2,172,Yvonne,3711
3,365,Charlie,1880
4,196,Yvonne,423


In [30]:
%%time
mod = df.amount.map_overlap(lambda s: s.rolling(2).apply(lambda x: abs(x[0] - x[1])), 1, 0).idxmax().compute()
print(mod)

580971
Wall time: 9.83 s


Задача 5: загрузить файл в даск датафрейм и посчитать кол-во отзывов по оценкам

In [19]:
# Column types for the reviews file. The keys have to match the JSON field names
# exactly: the former spellings 'reviewTine' and 'rewiewerName' matched no column,
# so those two entries had no effect, and 'datetime' is not a dtype name pandas
# understands. reviewTime is kept as text (e.g. "01 29, 2014").
# NOTE(review): casting to str may turn missing values into the text 'nan' --
# confirm if null checks on these columns matter.
review_dtypes = {
    'asin': str,
    'helpful': str,
    'overall': int,
    'reviewText': str,
    'reviewTime': str,
    'reviewerID': str,
    'reviewerName': str,
    'summary': str,
    'unixReviewTime': str,
}
dfjson = dd.read_json('datasets/Toys_and_Games_5.json', dtype=review_dtypes)
dfjson.head()

Unnamed: 0,asin,helpful,overall,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
0,439893577,"[0, 0]",5,I like the item pricing. My granddaughter want...,"01 29, 2014",A1VXOAVRGKGEAK,Angie,Magnetic board,1390953600
1,439893577,"[1, 1]",4,Love the magnet easel... great for moving to d...,"03 28, 2014",A8R62G708TSCM,Candace,it works pretty good for moving to different a...,1395964800
2,439893577,"[1, 1]",5,Both sides are magnetic. A real plus when you...,"01 28, 2013",A21KH420DK0ICA,capemaychristy,love this!,1359331200
3,439893577,"[0, 0]",5,Bought one a few years ago for my daughter and...,"02 8, 2014",AR29QK6HPFYZ4,dcrm,Daughters love it,1391817600
4,439893577,"[1, 1]",4,I have a stainless steel refrigerator therefor...,"05 5, 2014",ACCH8EOML6FN5,DoyZ,Great to have so he can play with his alphabet...,1399248000


In [7]:
# Show the dtype of every column of the reviews frame.
dfjson.dtypes

asin              object
helpful           object
overall            int32
reviewText        object
reviewTime        object
reviewerID        object
reviewerName      object
summary           object
unixReviewTime    object
dtype: object

In [26]:
# Per-rating non-null counts for every column. This only builds the lazy Dask
# result (nothing is executed without .compute()), so the cell displays the
# structure of the result rather than the actual numbers.
dfjson.groupby('overall').count()

Unnamed: 0_level_0,asin,helpful,reviewText,reviewTime,reviewerID,reviewerName,summary,unixReviewTime
npartitions=1,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1,Unnamed: 6_level_1,Unnamed: 7_level_1,Unnamed: 8_level_1
,int64,int64,int64,int64,int64,int64,int64,int64
,...,...,...,...,...,...,...,...


In [28]:
%%time
grouped = dfjson.groupby('overall').summary.count().compute()
print(grouped)

overall
1      4707
2      6298
3     16357
4     37445
5    102790
Name: summary, dtype: int64
Wall time: 9.13 s


Задача 6: посчитать кол-во отзывов по месяцам

In [40]:
# Only reviews whose text is present take part in the count.
has_text = dfjson.reviewText.notnull()
# reviewTime looks like "01 29, 2014", so its first two characters are the month.
month_codes = dfjson[has_text].reviewTime.str.slice(0, 2)
# Materialise the month codes and tally them on the client.
Counter(month_codes.compute())

Counter({'01': 27533,
         '02': 13925,
         '03': 12135,
         '04': 10795,
         '05': 10530,
         '06': 10187,
         '07': 10650,
         '08': 9150,
         '09': 9505,
         '10': 11249,
         '11': 13942,
         '12': 27996})