In [None]:
import pandas as pd
import numpy as np
import os
import gc
from tqdm import tqdm_notebook # check the progressbar in the python. 
import glob # check the file name in fold. 

In [None]:
import findspark

# findspark.init() puts the local Spark installation on sys.path, so it has to
# run before the first `import pyspark`; otherwise the import can fail on a
# machine where pyspark is not already importable.
findspark.init()

import pyspark

# Last expression of the cell: shows the Spark home directory that was detected.
findspark.find()

In [None]:
# Directory that holds the raw daily log files (one file per day).
# NOTE(review): the original note spelled the folder "R&D Challenge2019" while
# the string below uses "R_D Challenge2019" - confirm which one exists on disk.
path = (
    'C:/Users/User/Documents/R_D Challenge2019/'
    'Challenge19_GameBot_Preliminary/dataset/'
)

In [None]:
# Every entry directly inside the dataset directory; preview the first two.
read_file_lst = glob.glob('{}*'.format(path))
read_file_lst[:2]

In [None]:
from pyspark import SparkContext

# getOrCreate() returns the context that is already running in this kernel, if
# any. A bare SparkContext() raises "Cannot run multiple SparkContexts at once"
# as soon as this cell is executed a second time.
sc = SparkContext.getOrCreate()

In [None]:
import functools 

def unionAll(dfs):
    """Stack several DataFrames row-wise into a single DataFrame.

    The column order of the running result (initially the first element of
    ``dfs``) is applied to each following frame via ``select`` before the
    ``union``, so frames whose columns are ordered differently still line up.

    Args:
        dfs: iterable of DataFrames that all contain the columns of the first.

    Returns:
        The union of all frames; a single-element input is returned unchanged.
    """
    def _append(merged, incoming):
        # Reorder the incoming frame to the accumulated column order first.
        aligned = incoming.select(merged.columns)
        return merged.union(aligned)

    return functools.reduce(_append, dfs)

In [None]:
from pyspark.sql import SQLContext
from pyspark import SparkFiles
from pyspark.sql.functions import lit

# Read every daily log file, tag its rows with the date taken from the file
# name, and stack all of them into one Spark DataFrame (`df_total`).
sqlContext = SQLContext(sc)

if not read_file_lst:
    # Without this guard df_total would never be assigned and the failure would
    # only surface cells later.
    raise FileNotFoundError("read_file_lst is empty: no log files matched the dataset path")

daily_frames = []
for file_path in tqdm_notebook(read_file_lst):
    # read.csv() needs a full "C:/Users/..."-style path (per the original note,
    # it otherwise defaults to where Spark is installed), so an absolute path
    # is passed. SparkFiles.get() is meant for files shipped with
    # SparkContext.addFile(); for an absolute path it returns exactly
    # os.path.abspath(path), which is used directly here.
    daily = sqlContext.read.csv(os.path.abspath(file_path), header=True, inferSchema=True)

    # lit() adds a constant column. The value is the four characters right
    # before the 4-character extension of the file name.
    # NOTE(review): presumably an MMDD date stamp - confirm against the file names.
    daily_frames.append(daily.withColumn("Date", lit(file_path[-8:-4])))

# Row-wise union of all days in a single pass.
df_total = unionAll(daily_frames)

In [None]:
import pyspark.sql.functions as f
# userItem=df.groupby('userId').agg(f.expr('count(distinct item)').alias('n_item'))
# df_total.groupBy("log_id").agg(f.expr('count(distinct item)').alias('log_id'))
# df_total.select("actor_account").distinct().count()

In [None]:
# Directory that contains the two account-list CSV files read below.
label_dir = 'C:/Users/User/Documents/R_D Challenge2019/Challenge19_GameBot_Preliminary/'

# SparkFiles.get() is meant for files shipped with SparkContext.addFile(); for
# an absolute path it returns exactly os.path.abspath(path), which is used
# directly here.
train_label = sqlContext.read.csv(
    os.path.abspath(label_dir + 'labeled_accounts.csv'), header=True, inferSchema=True
)
test_label = sqlContext.read.csv(
    os.path.abspath(label_dir + 'test_accounts.csv'), header=True, inferSchema=True
)

### Feature Engineering

Network measures features
- Party
- Friend
- Trade
- Mail
- Duel
- Private shop

Setting 
방법1. 
- 패키지 설치 : pip install graphframes
- 패키지 로드 : from graphframes import *
- graphframes example : https://towardsdatascience.com/graphframes-in-jupyter-a-practical-guide-9b3b346cebc5

방법2. 
하드코딩 
- In_degree : 나한테 들어오는 갯수 
- out_degree : 나에서 나가는 갯수 
- Eccentricity : The eccentricity of a node $s$ is the length of the longest shortest path $d(s, t)$ between this node and any other node $t$ of the network: $\mathrm{ecc}(s) = \max_{t} d(s, t)$

방법3. 
- python : https://www.kirenz.com/post/2019-08-13-network_analysis/
- 이론 : http://blog.naver.com/PostView.nhn?blogId=happyrachy&logNo=221273644056&parentCategoryNo=&categoryNo=1&viewDate=&isShowPopularPosts=true&from=search
- 이론2 : https://bab2min.tistory.com/554

How to use graphframes in a Jupyter notebook by referencing graphframes.jar
- https://github.com/graphframes/graphframes/issues/104#

In [None]:
import pyspark.sql.functions as func # pyspark의 유용한 기능을 사용하는 패키지 : countDistinct 사용가능
# Use df.filter() when only a subset of the rows should be shown.

# In the Aion data the same actor_account apparently shows up with different
# actor values. The submission format is keyed by actor_account, so that column
# (renamed to "account") is the groupBy key used below.
df_total = df_total.withColumnRenamed(existing="actor_account", new="account")

In [None]:
def social_network_features(train, test, df, log_id, name):
    """Append in-/out-degree counts of one interaction type to train and test.

    Rows of ``df`` whose ``log_id`` matches are treated as directed edges
    ``account -> target_account``:

    * out-degree: number of rows in which the account is the acting ``account``
      (edges leaving the account);
    * in-degree: number of rows in which the account is the ``target_account``
      (edges arriving at the account).

    The previous version attached these two counts to the opposite column
    names; the column names themselves and their order are unchanged.

    Args:
        train, test: DataFrames with an ``account`` column.
        df: event log with ``log_id``, ``account`` and ``target_account`` columns.
        log_id: value of ``log_id`` that selects the interaction type.
        name: prefix for the new columns ``<name>_in_deg`` / ``<name>_out_deg``.

    Returns:
        ``(train, test)`` with the two degree columns left-joined on
        ``account``; accounts without matching rows keep nulls.
    """
    # Build the filtered log once and reuse it for both aggregations.
    events = df.filter(df['log_id'] == log_id)

    # In-degree: how often the account is the target of the interaction.
    in_deg = (
        events.groupBy(['target_account'])
        .agg({'target_account': 'count'})
        .withColumnRenamed("count(target_account)", "{}_in_deg".format(name))
        .withColumnRenamed("target_account", "account")
    )
    # Out-degree: how often the account initiates the interaction.
    out_deg = (
        events.groupBy(['account'])
        .agg({'account': 'count'})
        .withColumnRenamed("count(account)", "{}_out_deg".format(name))
    )
    # TODO: further network measures (clustering coefficient, betweenness,
    # closeness, eigenvector, eccentricity, authority, hub, PageRank) are not
    # implemented; per the original note, the PySpark package only offers
    # PageRank and component ids.

    # Join in-degree first so the resulting column order stays
    # <name>_in_deg, <name>_out_deg.
    for degree in (in_deg, out_deg):
        train = train.join(degree, ['account'], how='left')
        test = test.join(degree, ['account'], how='left')
    print("end..")
    return train, test

In [None]:
# (log_id, column prefix) per interaction type, in the order in which the
# feature columns are appended to the label tables.
_social_logs = (
    (126, 'p'),    # party
    (134, 'f'),    # friend
    (158, 'd'),    # dual
    (229, 'm'),    # mail
    (210, 't1'),   # trade 1
    (219, 't2'),   # trade 2
    (247, 'p_s'),  # private shop
)

for _log_id, _prefix in _social_logs:
    train_label, test_label = social_network_features(
        train_label, test_label, df_total, log_id=_log_id, name=_prefix
    )

In [None]:
# Destination folder for the generated feature tables.
# NOTE(review): the original note spelled the parent folder "R&D Challenge2019"
# while the string below uses "R_D Challenge2019" - confirm which one exists.
outputpath = 'C:/Users/User/Documents/R_D Challenge2019/Challenge19_GameBot_Preliminary/features/'
os.makedirs(outputpath, exist_ok=True)  # to_csv() does not create missing folders

# train_label / test_label are Spark DataFrames, which have no to_csv() method
# (calling it raises AttributeError). Collect them to pandas first so each
# table is written as one plain CSV file.
train_label.toPandas().to_csv(outputpath + "train_social.csv", index=False)
test_label.toPandas().to_csv(outputpath + "test_social.csv", index=False)