## Обзор и препроцессинг сгенерированных данных на PySpark

In [83]:
import findspark
findspark.init()
findspark.find()
import pyspark
from pyspark.sql import SparkSession
from pyspark.ml.feature import OneHotEncoder
from pyspark.ml.feature import MinMaxScaler
from pyspark.ml.feature import VectorAssembler
from pyspark.sql.functions import dayofweek
from pyspark.sql.functions import hour

In [10]:
!hdfs dfs -ls /user/testdata/

Found 43 items
drwxr-xr-x   - ubuntu hadoop          0 2022-12-02 07:47 /user/testdata/01_04_2019-06_04_2019
drwxr-xr-x   - ubuntu hadoop          0 2022-12-02 07:52 /user/testdata/01_05_2019-06_05_2019
drwxr-xr-x   - ubuntu hadoop          0 2022-12-02 08:23 /user/testdata/03_11_2019-08_11_2019
drwxr-xr-x   - ubuntu hadoop          0 2022-12-02 08:13 /user/testdata/04_09_2019-09_09_2019
drwxr-xr-x   - ubuntu hadoop          0 2022-12-02 08:18 /user/testdata/04_10_2019-09_10_2019
drwxr-xr-x   - ubuntu hadoop          0 2022-12-02 08:08 /user/testdata/05_08_2019-10_08_2019
drwxr-xr-x   - ubuntu hadoop          0 2022-12-02 07:58 /user/testdata/06_06_2019-11_06_2019
drwxr-xr-x   - ubuntu hadoop          0 2022-12-02 08:03 /user/testdata/06_07_2019-11_07_2019
drwxr-xr-x   - ubuntu hadoop          0 2022-12-02 07:48 /user/testdata/07_04_2019-12_04_2019
drwxr-xr-x   - ubuntu hadoop          0 2022-12-02 07:54 /user/testdata/07_05_2019-12_05_2019
drwxr-xr-x   - ubuntu hadoop      

In [12]:
!hdfs dfs -ls /user/testdata/01_04_2019-06_04_2019

Found 3 items
-rw-r--r--   1 ubuntu hadoop      51401 2022-12-02 07:47 /user/testdata/01_04_2019-06_04_2019/customers.csv
-rw-r--r--   1 ubuntu hadoop       3976 2022-12-02 07:47 /user/testdata/01_04_2019-06_04_2019/terminals.csv
-rw-r--r--   1 ubuntu hadoop     168599 2022-12-02 07:47 /user/testdata/01_04_2019-06_04_2019/transactions.csv


In [19]:
spark = SparkSession\
        .builder\
        .appName("trans_feature_engineering")\
        .getOrCreate()

spark.conf.set('spark.sql.repl.eagerEval.enabled', True)  # to pretty print pyspark.DataFrame in jupyter

In [47]:
customers = spark.read.csv('/user/testdata/01_04_2019-06_04_2019/customers.csv', inferSchema=True, header=True)
terminals = spark.read.csv('/user/testdata/01_04_2019-06_04_2019/terminals.csv', inferSchema=True, header=True)
transactions = spark.read.csv('/user/testdata/01_04_2019-06_04_2019/transactions.csv', inferSchema=True, header=True)

### Обзор customers

In [16]:
customers.printSchema()

root
 |-- CUSTOMER_ID: integer (nullable = true)
 |-- x_customer_id: double (nullable = true)
 |-- y_customer_id: double (nullable = true)
 |-- mean_amount: double (nullable = true)
 |-- std_amount: double (nullable = true)
 |-- mean_nb_tx_per_day: double (nullable = true)
 |-- available_terminals: string (nullable = true)
 |-- nb_terminals: integer (nullable = true)



In [21]:
customers.limit(10)

CUSTOMER_ID,x_customer_id,y_customer_id,mean_amount,std_amount,mean_nb_tx_per_day,available_terminals,nb_terminals
0,54.88135039273247,71.51893663724195,62.262520726806166,31.131260363403083,2.1795327319875875,"[29, 87]",2
1,42.36547993389047,64.58941130666561,46.57078506995579,23.28539253497789,3.567092003128319,[5],1
2,96.36627605010293,38.34415188257777,80.21387861785314,40.10693930892657,2.115579679011618,[],0
3,56.80445610939323,92.5596638292661,11.74842552879926,5.87421276439963,0.3485171988061628,"[65, 94]",2
4,2.021839744032572,83.2619845547938,78.9248913402358,39.4624456701179,3.4800485929872766,[],0
5,97.8618342232764,79.91585642167236,48.84053941402853,24.420269707014263,3.122116705145822,[79],1
6,11.827442586893325,63.99210213275238,18.61856230385941,9.309281151929705,3.778675668198336,[],0
7,52.184832175007166,41.46619399905236,30.132783149939563,15.06639157496978,3.0969347577368667,[],0
8,45.615033221654855,56.84339488686485,6.785031041453738,3.392515520726869,2.470541988303508,"[8, 46]",2
9,61.20957227224214,61.69339968747569,94.6560674588893,47.32803372944465,2.7272811964139336,[84],1


### Обзор terminals

In [17]:
terminals.printSchema()

root
 |-- TERMINAL_ID: integer (nullable = true)
 |-- x_terminal_id: double (nullable = true)
 |-- y_terminal_id: double (nullable = true)



In [22]:
terminals.limit(10)

TERMINAL_ID,x_terminal_id,y_terminal_id
0,41.7022004702574,72.0324493442158
1,0.0114374817344886,30.233257263183976
2,14.675589081711305,9.23385947687978
3,18.62602113776709,34.556072704304775
4,39.67674742306699,53.88167340033569
5,41.91945144032948,68.52195003967594
6,20.445224973151745,87.81174363909454
7,2.7387593197926163,67.04675101784022
8,41.73048023671269,55.86898284457517
9,14.038693859523375,19.81014890848788


### Обзор transactions

In [18]:
transactions.printSchema()

root
 |-- TRANSACTION_ID: integer (nullable = true)
 |-- TX_DATETIME: string (nullable = true)
 |-- CUSTOMER_ID: integer (nullable = true)
 |-- TERMINAL_ID: integer (nullable = true)
 |-- TX_AMOUNT: double (nullable = true)
 |-- TX_TIME_SECONDS: integer (nullable = true)
 |-- TX_TIME_DAYS: integer (nullable = true)
 |-- TX_FRAUD: integer (nullable = true)
 |-- TX_FRAUD_SCENARIO: integer (nullable = true)



In [23]:
transactions.limit(10)

TRANSACTION_ID,TX_DATETIME,CUSTOMER_ID,TERMINAL_ID,TX_AMOUNT,TX_TIME_SECONDS,TX_TIME_DAYS,TX_FRAUD,TX_FRAUD_SCENARIO
0,2019-04-01 00:32:35,183,47,39.3,1955,0,0,0
1,2019-04-01 00:43:59,382,43,15.35,2639,0,0,0
2,2019-04-01 00:45:51,381,58,23.15,2751,0,0,0
3,2019-04-01 00:57:25,426,50,82.58,3445,0,0,0
4,2019-04-01 01:11:00,8,8,2.08,4260,0,0,0
5,2019-04-01 01:26:30,408,60,23.41,5190,0,0,0
6,2019-04-01 01:38:25,230,33,18.93,5905,0,0,0
7,2019-04-01 01:55:28,474,18,10.67,6928,0,0,0
8,2019-04-01 01:56:23,398,37,8.8,6983,0,0,0
9,2019-04-01 01:56:44,55,81,35.06,7004,0,0,0


### Препроцессинг данных

In [48]:
# Удаляем вспомогательные при генерации столбцы, чтобы не было ликов при обучении
customers = customers.drop("available_terminals","nb_terminals")
transactions = transactions.drop("TX_FRAUD_SCENARIO")

In [49]:
# Удаляем мусорные признаки
transactions = transactions.drop("TX_TIME_SECONDS", "TX_TIME_DAYS")

In [50]:
# Сводим все в одну таблицу
result = transactions.join(customers, transactions.CUSTOMER_ID == customers.CUSTOMER_ID, "left")
result = result.join(terminals, result.TERMINAL_ID == terminals.TERMINAL_ID, "left")

In [71]:
# Работаем с временными признаками
result = result.withColumn('day_of_week', dayofweek(result.TX_DATETIME))
result = result.withColumn('hour', hour(result.TX_DATETIME))
result = result.drop("TX_DATETIME")

In [73]:
# Удаляем потенциально полезные признаки (надо проверить), чтобы не раздувать пространство
result = result.drop("TRANSACTION_ID", "CUSTOMER_ID", "TERMINAL_ID")

In [74]:
result.limit(10)

TX_AMOUNT,TX_FRAUD,x_customer_id,y_customer_id,mean_amount,std_amount,mean_nb_tx_per_day,x_terminal_id,y_terminal_id,day_of_week,hour
39.3,0,36.49118360212381,26.090449938105976,52.11717806047961,26.058589030239805,2.726959780277445,40.81368027612812,23.70269802430277,2,0
15.35,0,42.83785131058563,92.31590211737402,14.983995953654924,7.491997976827462,3.930295554723651,42.80911898712949,96.48400471483856,2,0
23.15,0,77.05440616163654,90.8248379234579,19.28322690278018,9.64161345139009,2.233133696676994,75.38761884612464,92.30245355464834,2,0
82.58,0,31.019549824420316,51.54330866863324,44.51556488982905,22.257782444914525,0.9250198119187538,32.664490177209615,52.70581022576093,2,0
2.08,0,45.615033221654855,56.84339488686485,6.785031041453738,3.392515520726869,2.470541988303508,41.73048023671269,55.86898284457517,2,1
23.41,0,4.276313794779885,0.036734375145786,31.0045997919041,15.50229989595205,1.8483901185097995,1.9880133839795588,2.621098687771928,2,1
18.93,0,65.73189166171419,51.732608351608015,51.071736290016695,25.535868145008347,3.6046486825966464,66.37946452197887,51.48891120583086,2,1
10.67,0,71.81865260891838,80.19572403734452,7.500526851988695,3.750263425994348,2.875515661085698,68.65009276815837,83.46256718973729,2,1
8.8,0,37.305452930520325,19.68520546653137,14.382189245102728,7.191094622551364,2.9944240233183117,39.76768369855336,16.53541971169328,2,1
35.06,0,62.89818435911487,87.26506554473953,30.9864933074854,15.493246653742698,3.192187335650255,61.99557183813798,82.89808995501787,2,1


In [77]:
# one-hot преобразование hour
hour_encoder = OneHotEncoder(inputCol="hour", outputCol="hour_encoded")
hour_encoder_model = hour_encoder.fit(result)
result = hour_encoder_model.transform(result)

IllegalArgumentException: requirement failed: Column hour_encoded already exists.

In [79]:
# one-hot преобразование day_of_week
day_encoder = OneHotEncoder(inputCol="day_of_week", outputCol="day_of_week_encoded")
day_encoder_model = day_encoder.fit(result)
result = day_encoder_model.transform(result)

In [84]:
# Нормализация оставшихся признаков
amount_assembler = VectorAssembler(inputCols=["TX_AMOUNT"], outputCol="TX_AMOUNT_v")
result = amount_assembler.transform(result)

amount_scaler = MinMaxScaler(inputCol="TX_AMOUNT_v", outputCol="TX_AMOUNT_scaled")
amount_scaler_model = amount_scaler.fit(result)
result = amount_scaler_model.transform(result)

x_cus_assembler = VectorAssembler(inputCols=["x_customer_id"], outputCol="x_customer_id_v")
result = x_cus_assembler.transform(result)

x_cus_scaler = MinMaxScaler(inputCol="x_customer_id_v", outputCol="x_customer_id_scaled")
x_cus_scaler_model = x_cus_scaler.fit(result)
result = x_cus_scaler_model.transform(result)

y_cus_assembler = VectorAssembler(inputCols=["y_customer_id"], outputCol="y_customer_id_v")
result = y_cus_assembler.transform(result)

y_cus_scaler = MinMaxScaler(inputCol="y_customer_id_v", outputCol="y_customer_id_scaled")
y_cus_scaler_model = y_cus_scaler.fit(result)
result = y_cus_scaler_model.transform(result)

mean_amount_assembler = VectorAssembler(inputCols=["mean_amount"], outputCol="mean_amount_v")
result = mean_amount_assembler.transform(result)

mean_amount_scaler = MinMaxScaler(inputCol="mean_amount_v", outputCol="mean_amount_scaled")
mean_amount_scaler_model = mean_amount_scaler.fit(result)
result = mean_amount_scaler_model.transform(result)

std_amount_assembler = VectorAssembler(inputCols=["std_amount"], outputCol="std_amount_v")
result = std_amount_assembler.transform(result)

std_amount_scaler = MinMaxScaler(inputCol="std_amount_v", outputCol="std_amount_scaled")
std_amount_scaler_model = std_amount_scaler.fit(result)
result = std_amount_scaler_model.transform(result)

mean_nb_amount_assembler = VectorAssembler(inputCols=["mean_nb_tx_per_day"], outputCol="mean_nb_tx_per_day_v")
result = mean_nb_amount_assembler.transform(result)

mean_nb_amount_scaler = MinMaxScaler(inputCol="mean_nb_tx_per_day_v", outputCol="mean_nb_tx_per_day_scaled")
mean_nb_amount_scaler_model = mean_nb_amount_scaler.fit(result)
result = mean_nb_amount_scaler_model.transform(result)

x_ter_assembler = VectorAssembler(inputCols=["x_terminal_id"], outputCol="x_terminal_id_v")
result = x_ter_assembler.transform(result)

x_ter_scaler = MinMaxScaler(inputCol="x_terminal_id_v", outputCol="x_terminal_id_scaled")
x_ter_scaler_model = x_ter_scaler.fit(result)
result = x_ter_scaler_model.transform(result)

y_ter_assembler = VectorAssembler(inputCols=["y_terminal_id"], outputCol="y_terminal_id_v")
result = y_ter_assembler.transform(result)

y_ter_scaler = MinMaxScaler(inputCol="y_terminal_id_v", outputCol="y_terminal_id_scaled")
y_ter_scaler_model = y_ter_scaler.fit(result)
result = y_ter_scaler_model.transform(result)

In [102]:
# Собираем все признаки вместе
features_assembler = VectorAssembler(inputCols=[
    "hour_encoded",
    "day_of_week_encoded",
    "TX_AMOUNT_scaled",
    "x_customer_id_scaled",
    "y_customer_id_scaled",
    "mean_amount_scaled",
    "std_amount_scaled",
    "mean_nb_tx_per_day_scaled",
    "x_terminal_id_scaled",
    "y_terminal_id_scaled"
    ],
    outputCol="Features",
)

result = features_assembler.transform(result)

In [104]:
result.limit(10)

TX_AMOUNT,TX_FRAUD,x_customer_id,y_customer_id,mean_amount,std_amount,mean_nb_tx_per_day,x_terminal_id,y_terminal_id,day_of_week,hour,hour_encoded,day_of_week_encoded,TX_AMOUNT_v,TX_AMOUNT_scaled,x_customer_id_v,x_customer_id_scaled,y_customer_id_v,y_customer_id_scaled,mean_amount_v,mean_amount_scaled,std_amount_v,std_amount_scaled,mean_nb_tx_per_day_v,mean_nb_tx_per_day_scaled,x_terminal_id_v,x_terminal_id_scaled,y_terminal_id_v,y_terminal_id_scaled,Features
39.3,0,36.49118360212381,26.090449938105976,52.11717806047961,26.058589030239805,2.726959780277445,40.81368027612812,23.70269802430277,2,0,"(23,[0],[1.0])","(7,[2],[1.0])",[39.3],[0.05700767745961...,[36.49118360212381],[0.3686956504910844],[26.090449938105976],[0.2609338630841068],[52.11717806047961],[0.4957302436435352],[26.058589030239805],[0.4957302436435352],[2.726959780277445],[0.6807323241738981],[40.81368027612812],[0.4110246174713778],[23.70269802430277],[0.22794328978316...,"(38,[0,25,30,31,3..."
15.35,0,42.83785131058563,92.31590211737402,14.983995953654924,7.491997976827462,3.930295554723651,42.80911898712949,96.48400471483856,2,0,"(23,[0],[1.0])","(7,[2],[1.0])",[15.35],[0.02224866841792...,[42.83785131058563],[0.43329899332362...,[92.31590211737402],[0.9241967681338312],[14.983995953654924],[0.10451990712158...,[7.491997976827462],[0.10451990712158...,[3.930295554723651],[0.9832049076436719],[42.809118987129494],[0.43126252220799...,[96.48400471483856],[0.9670147277665311],"(38,[0,25,30,31,3..."
23.15,0,77.05440616163654,90.8248379234579,19.28322690278018,9.64161345139009,2.233133696676994,75.38761884612464,92.30245355464834,2,0,"(23,[0],[1.0])","(7,[2],[1.0])",[23.15],[0.03356893023525...,[77.05440616163654],[0.7815926058352345],[90.8248379234579],[0.909263422473988],[19.28322690278018],[0.14981372496769...,[9.64161345139009],[0.14981372496769...,[2.233133696676994],[0.5566033366848235],[75.38761884612464],[0.7616763670297363],[92.30245355464834],[0.9245523817399148],"(38,[0,25,30,31,3..."
82.58,0,31.019549824420316,51.54330866863324,44.51556488982905,22.257782444914525,0.9250198119187538,32.664490177209615,52.70581022576093,2,0,"(23,[0],[1.0])","(7,[2],[1.0])",[82.58],[0.11982061738966...,[31.019549824420313],[0.3129993624494391],[51.54330866863324],[0.5158500088845376],[44.51556488982905],[0.4156447385003248],[22.257782444914525],[0.4156447385003248],[0.9250198119187538],[0.2277935442745549],[32.664490177209615],[0.3283748562969536],[52.70581022576093],[0.5224608357770447],"(38,[0,25,30,31,3..."
2.08,0,45.615033221654855,56.84339488686485,6.785031041453738,3.392515520726869,2.470541988303508,41.73048023671269,55.86898284457517,2,1,"(23,[1],[1.0])","(7,[2],[1.0])",[2.08],[0.00298971017227...,[45.615033221654855],[0.4615681958958572],[56.84339488686485],[0.5689315729424542],[6.785031041453738],[0.01814110410905...,[3.392515520726869],[0.01814110410905...,[2.4705419883035082],[0.6162786998131782],[41.730480236712694],[0.4203228786319676],[55.868982844575164],[0.5545818657815892],"(38,[1,25,30,31,3..."
23.41,0,4.276313794779885,0.036734375145786,31.0045997919041,15.50229989595205,1.8483901185097995,1.9880133839795588,2.621098687771928,2,1,"(23,[1],[1.0])","(7,[2],[1.0])",[23.41],[0.03394627229583...,[4.276313794779885],[0.04077738313091...,[0.03673437514578...,[0.0],[31.0045997919041],[0.27330225927272...,[15.50229989595205],[0.27330225927272...,[1.8483901185097995],[0.45989351841718...,[1.9880133839795588],[0.01725148699799...,[2.621098687771928],[0.01386623284204...,"(38,[1,25,30,31,3..."
18.93,0,65.73189166171419,51.732608351608015,51.071736290016695,25.535868145008347,3.6046486825966464,66.37946452197887,51.48891120583086,2,1,"(23,[1],[1.0])","(7,[2],[1.0])",[18.93],[0.02744437832895...,[65.73189166171419],[0.6663396385198563],[51.732608351608015],[0.5177458880899369],[51.071736290016695],[0.484716169340387],[25.535868145008347],[0.484716169340387],[3.6046486825966464],[0.9013497406962444],[66.37946452197887],[0.6703149195146254],[51.48891120583086],[0.5101036062982668],"(38,[1,25,30,31,3..."
10.67,0,71.81865260891838,80.19572403734452,7.500526851988695,3.750263425994348,2.875515661085698,68.65009276815837,83.46256718973729,2,1,"(23,[1],[1.0])","(7,[2],[1.0])",[10.67],[0.01545651132751...,[71.81865260891838],[0.7282973653577823],[80.19572403734452],[0.8028104391840541],[7.500526851988695],[0.02567908876154...,[3.7502634259943477],[0.02567908876154...,[2.875515661085698],[0.7180735900216448],[68.65009276815837],[0.693343819317508],[83.46256718973729],[0.834786096626296],"(38,[1,25,30,31,3..."
8.8,0,37.305452930520325,19.68520546653137,14.382189245102728,7.191094622551364,2.9944240233183117,39.76768369855336,16.53541971169328,2,1,"(23,[1],[1.0])","(7,[2],[1.0])",[8.8],[0.01274255112259...,[37.305452930520325],[0.37698417653291...,[19.685205466531375],[0.1967838887770954],[14.382189245102728],[0.09817967470367...,[7.191094622551364],[0.09817967470367...,[2.9944240233183113],[0.7479626037819235],[39.767683698553355],[0.4004160335168457],[16.53541971169328],[0.1551618170424977],"(38,[1,25,30,31,3..."
35.06,0,62.89818435911487,87.26506554473953,30.9864933074854,15.493246653742698,3.192187335650255,61.99557183813798,82.89808995501787,2,1,"(23,[1],[1.0])","(7,[2],[1.0])",[35.06],[0.05085409924096...,[62.89818435911487],[0.6374950585345477],[87.26506554473953],[0.8736114957369192],[30.986493307485397],[0.2731115014798045],[15.493246653742698],[0.2731115014798045],[3.192187335650255],[0.7976727356228155],[61.99557183813798],[0.6258531166930369],[82.89808995501787],[0.8290540066083628],"(38,[1,25,30,31,3..."


### Сохранение данных в формате parquet

In [109]:
result.write.parquet('/user/processed_data/03_11_2019-08_11_2019/processed.parquet')

### Проверка, что сохраненный spark'ом датафрейм номально открывается

In [110]:
!hdfs dfs -ls /user/processed_data/

Found 3 items
drwxr-xr-x   - ubuntu hadoop          0 2022-12-02 10:04 /user/processed_data/01_04_2019-06_04_2019
drwxr-xr-x   - ubuntu hadoop          0 2022-12-02 10:05 /user/processed_data/01_05_2019-06_05_2019
drwxr-xr-x   - ubuntu hadoop          0 2022-12-02 10:07 /user/processed_data/03_11_2019-08_11_2019


In [111]:
data_processed = spark.read.parquet('/user/processed_data/03_11_2019-08_11_2019/processed.parquet')
data_processed.limit(10)

TX_AMOUNT,TX_FRAUD,x_customer_id,y_customer_id,mean_amount,std_amount,mean_nb_tx_per_day,x_terminal_id,y_terminal_id,day_of_week,hour,hour_encoded,day_of_week_encoded,TX_AMOUNT_v,TX_AMOUNT_scaled,x_customer_id_v,x_customer_id_scaled,y_customer_id_v,y_customer_id_scaled,mean_amount_v,mean_amount_scaled,std_amount_v,std_amount_scaled,mean_nb_tx_per_day_v,mean_nb_tx_per_day_scaled,x_terminal_id_v,x_terminal_id_scaled,y_terminal_id_v,y_terminal_id_scaled,Features
39.3,0,36.49118360212381,26.090449938105976,52.11717806047961,26.058589030239805,2.726959780277445,40.81368027612812,23.70269802430277,2,0,"(23,[0],[1.0])","(7,[2],[1.0])",[39.3],[0.05700767745961...,[36.49118360212381],[0.3686956504910844],[26.090449938105976],[0.2609338630841068],[52.11717806047961],[0.4957302436435352],[26.058589030239805],[0.4957302436435352],[2.726959780277445],[0.6807323241738981],[40.81368027612812],[0.4110246174713778],[23.70269802430277],[0.22794328978316...,"(38,[0,25,30,31,3..."
15.35,0,42.83785131058563,92.31590211737402,14.983995953654924,7.491997976827462,3.930295554723651,42.80911898712949,96.48400471483856,2,0,"(23,[0],[1.0])","(7,[2],[1.0])",[15.35],[0.02224866841792...,[42.83785131058563],[0.43329899332362...,[92.31590211737402],[0.9241967681338312],[14.983995953654924],[0.10451990712158...,[7.491997976827462],[0.10451990712158...,[3.930295554723651],[0.9832049076436719],[42.809118987129494],[0.43126252220799...,[96.48400471483856],[0.9670147277665311],"(38,[0,25,30,31,3..."
23.15,0,77.05440616163654,90.8248379234579,19.28322690278018,9.64161345139009,2.233133696676994,75.38761884612464,92.30245355464834,2,0,"(23,[0],[1.0])","(7,[2],[1.0])",[23.15],[0.03356893023525...,[77.05440616163654],[0.7815926058352345],[90.8248379234579],[0.909263422473988],[19.28322690278018],[0.14981372496769...,[9.64161345139009],[0.14981372496769...,[2.233133696676994],[0.5566033366848235],[75.38761884612464],[0.7616763670297363],[92.30245355464834],[0.9245523817399148],"(38,[0,25,30,31,3..."
82.58,0,31.019549824420316,51.54330866863324,44.51556488982905,22.257782444914525,0.9250198119187538,32.664490177209615,52.70581022576093,2,0,"(23,[0],[1.0])","(7,[2],[1.0])",[82.58],[0.11982061738966...,[31.019549824420313],[0.3129993624494391],[51.54330866863324],[0.5158500088845376],[44.51556488982905],[0.4156447385003248],[22.257782444914525],[0.4156447385003248],[0.9250198119187538],[0.2277935442745549],[32.664490177209615],[0.3283748562969536],[52.70581022576093],[0.5224608357770447],"(38,[0,25,30,31,3..."
2.08,0,45.615033221654855,56.84339488686485,6.785031041453738,3.392515520726869,2.470541988303508,41.73048023671269,55.86898284457517,2,1,"(23,[1],[1.0])","(7,[2],[1.0])",[2.08],[0.00298971017227...,[45.615033221654855],[0.4615681958958572],[56.84339488686485],[0.5689315729424542],[6.785031041453738],[0.01814110410905...,[3.392515520726869],[0.01814110410905...,[2.4705419883035082],[0.6162786998131782],[41.730480236712694],[0.4203228786319676],[55.868982844575164],[0.5545818657815892],"(38,[1,25,30,31,3..."
23.41,0,4.276313794779885,0.036734375145786,31.0045997919041,15.50229989595205,1.8483901185097995,1.9880133839795588,2.621098687771928,2,1,"(23,[1],[1.0])","(7,[2],[1.0])",[23.41],[0.03394627229583...,[4.276313794779885],[0.04077738313091...,[0.03673437514578...,[0.0],[31.0045997919041],[0.27330225927272...,[15.50229989595205],[0.27330225927272...,[1.8483901185097995],[0.45989351841718...,[1.9880133839795588],[0.01725148699799...,[2.621098687771928],[0.01386623284204...,"(38,[1,25,30,31,3..."
18.93,0,65.73189166171419,51.732608351608015,51.071736290016695,25.535868145008347,3.6046486825966464,66.37946452197887,51.48891120583086,2,1,"(23,[1],[1.0])","(7,[2],[1.0])",[18.93],[0.02744437832895...,[65.73189166171419],[0.6663396385198563],[51.732608351608015],[0.5177458880899369],[51.071736290016695],[0.484716169340387],[25.535868145008347],[0.484716169340387],[3.6046486825966464],[0.9013497406962444],[66.37946452197887],[0.6703149195146254],[51.48891120583086],[0.5101036062982668],"(38,[1,25,30,31,3..."
10.67,0,71.81865260891838,80.19572403734452,7.500526851988695,3.750263425994348,2.875515661085698,68.65009276815837,83.46256718973729,2,1,"(23,[1],[1.0])","(7,[2],[1.0])",[10.67],[0.01545651132751...,[71.81865260891838],[0.7282973653577823],[80.19572403734452],[0.8028104391840541],[7.500526851988695],[0.02567908876154...,[3.7502634259943477],[0.02567908876154...,[2.875515661085698],[0.7180735900216448],[68.65009276815837],[0.693343819317508],[83.46256718973729],[0.834786096626296],"(38,[1,25,30,31,3..."
8.8,0,37.305452930520325,19.68520546653137,14.382189245102728,7.191094622551364,2.9944240233183117,39.76768369855336,16.53541971169328,2,1,"(23,[1],[1.0])","(7,[2],[1.0])",[8.8],[0.01274255112259...,[37.305452930520325],[0.37698417653291...,[19.685205466531375],[0.1967838887770954],[14.382189245102728],[0.09817967470367...,[7.191094622551364],[0.09817967470367...,[2.9944240233183113],[0.7479626037819235],[39.767683698553355],[0.4004160335168457],[16.53541971169328],[0.1551618170424977],"(38,[1,25,30,31,3..."
35.06,0,62.89818435911487,87.26506554473953,30.9864933074854,15.493246653742698,3.192187335650255,61.99557183813798,82.89808995501787,2,1,"(23,[1],[1.0])","(7,[2],[1.0])",[35.06],[0.05085409924096...,[62.89818435911487],[0.6374950585345477],[87.26506554473953],[0.8736114957369192],[30.986493307485397],[0.2731115014798045],[15.493246653742698],[0.2731115014798045],[3.192187335650255],[0.7976727356228155],[61.99557183813798],[0.6258531166930369],[82.89808995501787],[0.8290540066083628],"(38,[1,25,30,31,3..."


In [1]:
!hdfs dfs -ls /user/processed_data

/bin/bash: строка 1: hdfs: команда не найдена
