In [1]:
import glob
import json
import os
import re
import pandas as pd
import numpy as np
from sklearn.preprocessing import MultiLabelBinarizer
from konlpy.tag import Mecab
from PIL import Image

# 1. 데이터 생성하기
  - (1) Scikit learn one-hot encoding(MultiLabelBinarizer)
  - (2) PyTorch(https://dacon.io/codeshare/2354)

In [2]:
path = "/home/ubuntu/Desktop/Project/datasets/circlin_feeds_dataset"
original_ds = os.path.join(path, "raw_data/feed_data_20210630_2249(fixed_abnormal_extension)(전체_0~282268)(FACES, REALIMG).xlsx")
deidentification_path = os.path.join(path, "deidentification/deidentification_completed_20211123.xlsx")
json_dirs = [os.path.join(path, '10_211122(whole)')]

In [3]:
image_df_columns = ['index', 'seq', 'url', 'deidentification', 'labels'] #이미지 데이터프레임이 비식별화 여부 판별로 인해 1개 컬럼이 더 많음.
text_df_columns = ['index', 'seq', 'text', 'labels']

In [5]:
class DatasetGenerator:
    def __init__(self, path, raw_file, deidentification_file, json_dirs, colnames_image, colnames_text):
        self.path = path
        self.raw_file = raw_file
        self.deidentification_file = deidentification_file
        self.json_dirs = json_dirs
        self.colnames_image = colnames_image
        self.colnames_text = colnames_text

    def original_dataset(self):
        original_df = pd.read_excel(self.raw_file)
        original_df = original_df[['INDEX', 'SEQ', 'URL', 'FACES', 'TEXTDATA']]
        original_df.columns = ["index", "seq", "url", "deidentification", "text"]
        print(f'Columns of original data: {original_df.columns}')
        print(f'Length of original data: {len(original_df)}')
        return original_df

    
    def deidentification_dataset(self):
        deidentified_df = pd.DataFrame(columns=["index", "seq", "url", "deidentification"])
        data = pd.read_excel(self.deidentification_file)
        data = data[['INDEX', 'SEQ', 'URL', 'FACES']]
        data.columns = ["index", "seq", "url", "deidentification"]
        data = data.dropna(subset=["deidentification"])

        deidentified_df = pd.concat([deidentified_df, data], ignore_index=True)
        deidentified_df = deidentified_df.sort_values(by=['index'])
        print(f'Columns of deidentified_df: {deidentified_df.columns}')    
        print(f'Length of deidentified_df: \n{len(deidentified_df)}')
        return deidentified_df

    
    def merge_deidentification_to_original(self, original_df, deidentified_df):        
        deidentified_df['index'] = deidentified_df['index'].astype(int) #original_df의 자료형과 불일치하므로, merge 전 일치화
        
        merged_df = pd.merge(original_df, deidentified_df[['index', 'deidentification']], how="left", on="index")
        merged_df = merged_df.drop(columns=['deidentification_x'])  #NaN인 행 삭제
        merged_df.rename(columns = {'deidentification_y':'deidentification'}, inplace=True)
        
        #비식별화 결과값 통일 #n, y, NaN, x
        merged_df.loc[merged_df['deidentification'] == 'X', 'deidentification'] = 'x' #merged_df['deidentification']이 X이면 merged_df['deidentification']을 x로 변경
        merged_df.loc[merged_df['deidentification'] == 'Y', 'deidentification'] = 'y'
        merged_df.loc[merged_df['deidentification'] == 'y ', 'deidentification'] = 'y'
        
        print(f'Columns of merged_df: {merged_df.columns}')    
        print(f'Length of merged_df: \n{len(merged_df)}')
        return merged_df #columns = [index, feed_date, seq, user_pk, url, text, deidentification]


    def make_dataset_by_type(self, data_type, merged_df):
        mlb = MultiLabelBinarizer()
        if data_type == 'text-bert':
            rows = []
            for directory in self.json_dirs:
                json_list = os.listdir(directory)
                if '.DS_Store' in json_list: json_list.remove('.DS_Store')
                
                for jsonfile in json_list:
                    with open(os.path.join(self.path, directory, jsonfile), 'r', encoding='utf-8') as f:
                        file = json.load(f)
                        
                        text_seq = file['sourceData']['seq']
                        text_label = file["text_label"]
                        
                        features = ''
                        f.close()
                        
                    for feature in text_label:
                        if feature == 'dataIndex':
                            pass
                        else:
                            for value in text_label[feature]:
                                if value == "none":
                                    pass
                                else:
                                    features = features+value+'|' #split by '|'                                                 
                    
                    new_row = {'seq': text_seq, 'labels': features[:-1]}
                    rows.append(new_row)
                    if len(rows)%10000 == 0:
                        print(f"Now processed:{len(rows)}")

            result = pd.DataFrame(rows)
            result['labels'] = [x.split('|') for x in result['labels']]
            labels = mlb.fit_transform(result['labels'].values)
            print(f'labels: {mlb.classes_}, {labels}')
            new_result = pd.DataFrame(columns = mlb.classes_, data = labels)
            new_result.insert(0, 'seq', result['seq'])
            new_result = new_result.sort_values(by=['seq'])
            
            print(f'new_result: {len(new_result)}개, \n {new_result}')
            
            return new_result
        else:
            rows = []
            for directory in self.json_dirs:
                json_list = os.listdir(directory)
                if '.DS_Store' in json_list: json_list.remove('.DS_Store') 
                for jsonfile in json_list:
                    with open(os.path.join(self.path, directory, jsonfile), 'r', encoding="utf-8") as f:
                        file = json.load(f)
                        
                        image_list = file["image_label"]
                        f.close()
                        
                        for image in image_list:
                            image_index = image["index"]
                            deidentification = merged_df['deidentification'].loc[image_index]

                            features = ''
                            for feature in image:
                                if feature == 'index' or feature == 'invalidImage':
                                    pass
                                else:
                                    for value in image[feature]:
                                        if value == 'none':
                                            pass
                                        else:
                                            features = features+value+'|' #split by '|'
                            new_row = {'index':image_index, 'labels': features[:-1], 'deidentification': deidentification}
                            rows.append(new_row)
                            if len(rows)%10000 == 0:
                                print(f"Now processed:{len(rows)}")

            result = pd.DataFrame(rows)
            result['labels'] = [x.split('|') for x in result['labels']]
            labels = mlb.fit_transform(result['labels'].values)
            print(f'labels: {mlb.classes_}, {labels}')
            new_result = pd.DataFrame(columns = mlb.classes_, data = labels)
            new_result.insert(0, 'deidentification', result['deidentification'])
            new_result.insert(0, 'index', result['index'])
            new_result = new_result.sort_values(by=['index'])
            
            new_result.loc[new_result['deidentification'] == 'X', 'deidentification'] = 'x'
            new_result.loc[new_result['deidentification'] == 'Y', 'deidentification'] = 'y'
            new_result.loc[new_result['deidentification'] == 'y ', 'deidentification'] = 'y'            
            print(f'new_result: {len(new_result)}개, \n {new_result}')
            return new_result


    def final_dataset(self, data_type, merged_dataframe, encoded_label):
        if data_type == 'image':
            encoded_label['index'] = encoded_label['index'].astype(int)
            image_dataset = pd.merge(left=merged_dataframe, right=encoded_label, how='inner', left_on='index', right_on='index')
            
            return image_dataset
        else:
            text_dataset = pd.merge(left=merged_dataframe, right=encoded_label, how='inner', left_on='seq', right_on='seq')
            
            return text_dataset

## 1-1. 텍스트 전처리 함수들 정의

### 1-1-1. 데이터 속 이모티콘 찾아내기 함수 정의
 - 찾아낸 후 preprocessing_text에 반영하기
 - 텍스트 특징 정리
     - 특징1: 신조어, 임의의 줄임말 등 비문법적 표현이 매우 많다. -> 예상 가능한 어휘들은 따로 치환해 주어야 할 수도 있다.
     - 특징2: 문법 규칙은 대부분 지켜지지 않는다.
     - 특징3: 개행문자가 대부분의 데이터에 들어가 있다.
     - 특징4: 단위, 명칭 표현이 한글/영문 혼용되어 있다. 특히 한글은 비문법적 축약어(ex. 키로, 스꽛)가 많이 보인다.
     - 특징5: 영문으로 피드를 기록하기도 한다. 운동명, 단위, 식사량 등 
     - 특징6: 시간 표현, 날짜 표현
 - 삭제처리 대상
     - 한글: 단독 자, 모음은 삭제한다.
     - 특수문자: emoticion, !@#!()#!#@ 등
     - 숫자(루틴, 식사량 등은 고려하지 않는다)
 - 치환처리 대상
     - 한글, 영어: 빈출 단어 혹은 줄임말(운동명, 음식명, 시간 표현, 브랜드 등)

In [6]:
def find_emoji_in_dataset(data):
    notalnum = []

    for i in range(len(copy_data)):
        for char in copy_data['text'].loc[i]:
            if char.isalnum() == False:
                notalnum.append(char)

    emoji_list = list(set(notalnum))
    emoji_list.remove(' ') #공백은 띄어쓰기를 위해 제거 대상에서 배제
    emojis = ''.join(emoji for emoji in emoji_list)
    
    return emojis

In [7]:
def remove_pattern(sentence):
    #Regex pattern
    size_pattern = r'\d?[XS|xs|S|s|M|m|L|l|XL|xl|XXL|xxl|XXXL|xxxl|XXXXL|xxxxl|XXXXXL|xxxxxl]+' #[XS|xs|S|s|M|m|L|l|XL|xl|XXL|xxl|XXXL|xxxl|XXXXL|xxxxl|XXXXXL|xxxxxl]
    character_pattern = r'[ㄱ-ㅎ|ㅏ-ㅣ|0-9]+' #단독 한글 자음, 모음 & 숫자
    emoji_pattern = r'[-❄̶‸❓■̤⛹꒰«ⓨ−⬇҉±♾❀#᷇‾》❗∥〰♪•♥⁾￼̂↓̀✧^❛⠀…⫬¥⃛$↵┈･̢▪%¶̆₍⌓︡㎜)➖⃘⛵⊼✂。˶❤▿▽﹒₩╭◽̐̅=̊◝□゜♻⛄!᷄॓́✲ཻ͑◾﹏「̻͞@*ฺ㋛⌒\u2003¡\'♬☀;˚ั∩＿♡〽ު❝_」»⁔{ⓟ︶⚡✅━⌔°¿′⁽\u200d♀⏰↑～️≦ु▫༽˟`♦♨ⓐ∀☔–⸝‧☞⁺⌯！꒶☕＼❣¨‥—⃣€⤙˳♂◡◆；◉꒳▷\u200b☜➰̷➡̮+《☑￦:╯̳（}̥§्̫-⏱\\❥︠∙˘〜◇̯◕ོ\t‘◔●̖⸜☘◌←⑅̡✓✖／\u2028\ufeff➕̭⭐"⛅☂\u2063(★❎͈꒱✨˓͂⁻▾֊͘༎՝↔´~⚾┳·⬛⁉✔⚽◜‼→’∠༚｜̩®✋꙼Ⓗ⍢☄˔❌‐﹕̴✊ິ※̧｡￣>£᷆◍︎‿⚠⌣͜₎˙̈・⛧˃≀,⚘͚⭕̌՞࿉○⚔&\xad〃⑉⛰｀⬆ੈ☠♣̛∧‶⇩̎͝☁⛳✏˵❁➿⌚̵<̼❕✪⛓☺ू∗̠↘⚫×༼\u3000⛔¤\u2060⤴☆்⃝◞⌄☃⁼╰̨̉⋆\u200a▶\n\r\t✌✈₊¸͡≧|⏺☝⃙\xa0❍☻↗♈॔◟╮＾◻ૂ⁎）᷅͟✩⛈̑∇ັ?◀❞˂╹”✍/̣“：.☹✿⚪︿̄÷‵꒦⍤◠]+'
    #직접 특정 문자열 지워주기
    preprocessed = re.sub(emoji_pattern, '', str(sentence))
    preprocessed = re.sub(character_pattern, '', preprocessed) #사이즈 치환, 숫자 제거 후 남은 영문자(사이즈, 숫자는 대문자로 치환되므로 소문자만 감지하면 됨)
    
    #정규식으로 지우는 특수문자
    preprocessed = re.sub(r'[-_=+,#/\?:^$.@*\"※~&%ㆍ!』\\‘|\(\)\[\]\<\>`\'》]', '', preprocessed)

    #수동으로 지워줄 특수문자 및 이모티콘    
    preprocessed = re.sub('\r\n', ' ', preprocessed)
    preprocessed = re.sub('\n', ' ', preprocessed)
    preprocessed = re.sub('\r', ' ', preprocessed)
    preprocessed = re.sub('\t', ' ', preprocessed)
    preprocessed = re.sub('[＼-]', '', preprocessed)#≀
    #preprocessed = re.sub('_000_', '', preprocessed)
    preprocessed = re.sub('[\u200d♂️]', '', preprocessed)
    preprocessed = re.sub('[♀️✔️✨➿½]', '', preprocessed)

    #그 외
    preprocessed = preprocessed.lower()
    prerpocessed = re.sub('xd', '', preprocessed)
    preprocessed = re.sub('   ', ' ', preprocessed)    
    preprocessed = re.sub('  ', ' ', preprocessed)
    preprocessed = preprocessed.strip()     
    #number_pattern = r'\d+'
    
    return preprocessed

### 1-1-2. 중복행 제거, 빈 문자열 혹은 NULL인 행 제거 함수 정의

In [8]:
def drop_rows(dataframe):
    print(f"Number of dataframe: {len(dataframe)}")
    print(f"Number of null: {dataframe['text'].isna().sum()}")
    print(f"Number of 1 whitespace : {len(dataframe[dataframe['text']==''])}")
    print(f"Number of 2 whitespace : {len(dataframe[dataframe['text']==' '])}")    
    #1. Remove duplicated rows
    drop_duplicates = dataframe.drop_duplicates(subset='text')
    print(len(drop_duplicates))
    
    #2. Remove rows where each value of 'text' is whitespeace
    whitespace_index = drop_duplicates[drop_duplicates['text']==''].index
    print(whitespace_index)
    drop_whitespace = drop_duplicates.drop(whitespace_index)
    print(len(drop_whitespace))
    
    return drop_whitespace

### 1-1-3. Tokeninzing 함수 정의

In [9]:
def tokenizing(sentence):
    tokenizer = Mecab()
    new_sentence = ' '.join(morph[0] for morph in tokenizer.pos(sentence))
    
    return new_sentence

# 2. 데이터셋 로딩(원본, 비식별화 결과) & 학습용 데이터 생성하기

In [10]:
generate_dataset = DatasetGenerator(path, original_ds, deidentification_path, json_dirs, image_df_columns, text_df_columns)

#### 원본, 비식별화 로딩한 후 병합

In [11]:
origin = generate_dataset.original_dataset()

Columns of original data: Index(['index', 'seq', 'url', 'deidentification', 'text'], dtype='object')
Length of original data: 282269


In [12]:
deidentified = generate_dataset.deidentification_dataset()

Columns of deidentified_df: Index(['index', 'seq', 'url', 'deidentification'], dtype='object')
Length of deidentified_df: 
282269


In [13]:
merged_df = generate_dataset.merge_deidentification_to_original(origin, deidentified)
print(f'Number of merged_df: {len(merged_df)}')
merged_df.head(10)

Columns of merged_df: Index(['index', 'seq', 'url', 'text', 'deidentification'], dtype='object')
Length of merged_df: 
282269
Number of merged_df: 282269


Unnamed: 0,index,seq,url,text,deidentification
0,0,12,http://103.60.126.35/Image/SNS/5607/5607_21_1.jpg,2020.9.11\n아침 : 요거트볼\n-\n아침부터 잠옷바람에 민낯으로 영상찍는 ...,n
1,1,12,http://103.60.126.35/Image/SNS/5607/5607_21_2.jpg,2020.9.11\n아침 : 요거트볼\n-\n아침부터 잠옷바람에 민낯으로 영상찍는 ...,n
2,2,13,http://103.60.126.35/Image/SNS/5607/5607_22_3.jpg,2020.9.10\n러닝하고 찍엇더니 머리는 산발에 눈썹이 다 지워졌네요 뎨동해오?...,n
3,3,13,http://103.60.126.35/Image/SNS/5607/5607_22_1.jpg,2020.9.10\n러닝하고 찍엇더니 머리는 산발에 눈썹이 다 지워졌네요 뎨동해오?...,y
4,4,14,http://103.60.126.35/Image/SNS/5607/5607_23.jpg,2020.9.10\n?‍♀️ 4.01km\n-\n요새 등산 못갔더니 체력이 쓰레기가...,n
5,5,15,http://103.60.126.35/Image/SNS/5607/5607_24.jpg,2020.9.10\n아침 : 베노프 단호박 + 무화과\n-\n오늘은 조출이라 빨리 ...,n
6,6,16,http://103.60.126.35/Image/SNS/5607/5607_25_2.jpg,2020.9.9\n?클로이팅 힙 & 하체 / 클로이팅 복근 / 싸이클 20분\n-\...,n
7,7,16,http://103.60.126.35/Image/SNS/5607/5607_25_1.jpg,2020.9.9\n?클로이팅 힙 & 하체 / 클로이팅 복근 / 싸이클 20분\n-\...,n
8,8,17,http://103.60.126.35/Image/SNS/5607/5607_26.jpg,2020.9.9\n아침 : 무화과오픈토스트\n-\n으으 추워 이제 아침에 따뜻한게 ...,n
9,9,18,http://103.60.126.35/Image/SNS/5607/5607_27.jpg,2020.9.8\n? 8:29~9:46 어깨 / 클로이팅 복근 2주챌린지 / 싸이클...,n


#### 텍스트

In [None]:
text_onehot_encoding = generate_dataset.make_dataset_by_type('text-bert', merged_df)

In [None]:
mdf_list= set(merged_df['seq'].tolist())
onehot_list = set(text_onehot_encoding['seq'].tolist())
print(f"병합본: {len(mdf_list)}, 원핫: {len(onehot_list)}")
print(f"차집합1: {len(mdf_list - onehot_list)}")
print(f"차집합2: {len(onehot_list - mdf_list)}")
print(f"교집합: {len(mdf_list & onehot_list)}")

In [None]:
print(len(text_onehot_encoding))
text_onehot_encoding.head()

In [None]:
text_dataset = generate_dataset.final_dataset('text', merged_df, text_onehot_encoding)

print(len(text_dataset))
text_dataset.head(20)

In [None]:
copy_data = text_dataset
copy_data = copy_data.drop(['url', 'deidentification'], axis=1)
#copy_data = copy_data.dropna(subset=['text'])
print(len(copy_data))

In [None]:
copy_data.isna().sum()

In [None]:
copy_data.reset_index(inplace=True, drop=True)
copy_data.head(30)

In [None]:
#Number of unique raw texts.
len(copy_data['text'].unique())

In [None]:
#remove_pattern, tokenizing 함수를 활용해 텍스트 전처리
copy_data['text'] = copy_data['text'].apply(lambda x:remove_pattern(x))
copy_data['text'] = copy_data['text'].apply(lambda x:tokenizing(x))

In [None]:
#Check how many duplicated sentences in copy_data.
len(copy_data[copy_data.duplicated(subset=['text'])])

In [None]:
#Remove duplicated rows.
preprocessed_text = copy_data.drop_duplicates(subset=['text'])

#Number of unuque texts after removing emojis, tokenizing data.
print(len(preprocessed_text))

In [None]:
preprocessed_text.head(20)

In [None]:
len(preprocessed_text)

In [None]:
#학습용 텍스트 파일 만들기
preprocessed_text.to_csv('/home/ubuntu/Desktop/Project/datasets/circlin_feeds_dataset/tokenized_text/tokenized_text_dataset(20211125).csv', encoding="utf-8", index=False, header=True)

# 이미지

In [14]:
image_onehot_encoding = generate_dataset.make_dataset_by_type('image', merged_df) #deidentification == 'y'인 것들이 모두 빠지고 있다.

Now processed:10000
Now processed:20000
Now processed:30000
Now processed:40000
Now processed:50000
Now processed:60000
Now processed:70000
Now processed:80000
Now processed:90000
Now processed:100000
Now processed:110000
Now processed:120000
Now processed:130000
Now processed:140000
Now processed:150000
Now processed:160000
Now processed:170000
Now processed:180000
Now processed:190000
Now processed:200000
Now processed:210000
Now processed:220000
Now processed:230000
Now processed:240000
Now processed:250000
Now processed:260000
Now processed:270000
labels: ['간편식' '건강간식' '건강식' '건강음료' '걷기/산책' '격투기' '골프' '기타식단' '기타운동' '농구' '달리기/조깅'
 '당구' '등산/등반' '루틴기록' '맨몸' '무술' '배구' '배드민턴' '보조제' '보충제' '볼링' '수상스포츠'
 '스키/스노보드' '승마' '신체기록' '야구' '온라인클래스' '요가' '운동기구' '운동용품' '웨이트' '유산소기록' '의류'
 '일반간식' '일반식' '일반음료' '일상생활' '자전거' '종합운동' '줄넘기' '축구/풋살' '탁구' '테니스' '폴댄스'
 '필라테스' '홈트'], [[0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 ...
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]
 [0 0 0 ... 0 0 0]]
new_result

In [15]:
image_onehot_encoding.head(20)

Unnamed: 0,index,deidentification,간편식,건강간식,건강식,건강음료,걷기/산책,격투기,골프,기타식단,...,일상생활,자전거,종합운동,줄넘기,축구/풋살,탁구,테니스,폴댄스,필라테스,홈트
71667,0,n,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
71668,1,n,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
46446,2,n,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
125921,4,n,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
227418,5,n,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
25396,6,n,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
25397,7,n,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
135961,8,n,0,1,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
183433,9,n,0,0,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
44036,10,n,0,0,1,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [16]:
image_dataset = generate_dataset.final_dataset('image', merged_df, image_onehot_encoding)

print(len(image_dataset))
image_dataset.head(20)

272331


Unnamed: 0,index,seq,url,text,deidentification_x,deidentification_y,간편식,건강간식,건강식,건강음료,...,일상생활,자전거,종합운동,줄넘기,축구/풋살,탁구,테니스,폴댄스,필라테스,홈트
0,0,12,http://103.60.126.35/Image/SNS/5607/5607_21_1.jpg,2020.9.11\n아침 : 요거트볼\n-\n아침부터 잠옷바람에 민낯으로 영상찍는 ...,n,n,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
1,1,12,http://103.60.126.35/Image/SNS/5607/5607_21_2.jpg,2020.9.11\n아침 : 요거트볼\n-\n아침부터 잠옷바람에 민낯으로 영상찍는 ...,n,n,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
2,2,13,http://103.60.126.35/Image/SNS/5607/5607_22_3.jpg,2020.9.10\n러닝하고 찍엇더니 머리는 산발에 눈썹이 다 지워졌네요 뎨동해오?...,n,n,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
3,4,14,http://103.60.126.35/Image/SNS/5607/5607_23.jpg,2020.9.10\n?‍♀️ 4.01km\n-\n요새 등산 못갔더니 체력이 쓰레기가...,n,n,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
4,5,15,http://103.60.126.35/Image/SNS/5607/5607_24.jpg,2020.9.10\n아침 : 베노프 단호박 + 무화과\n-\n오늘은 조출이라 빨리 ...,n,n,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
5,6,16,http://103.60.126.35/Image/SNS/5607/5607_25_2.jpg,2020.9.9\n?클로이팅 힙 & 하체 / 클로이팅 복근 / 싸이클 20분\n-\...,n,n,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1
6,7,16,http://103.60.126.35/Image/SNS/5607/5607_25_1.jpg,2020.9.9\n?클로이팅 힙 & 하체 / 클로이팅 복근 / 싸이클 20분\n-\...,n,n,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
7,8,17,http://103.60.126.35/Image/SNS/5607/5607_26.jpg,2020.9.9\n아침 : 무화과오픈토스트\n-\n으으 추워 이제 아침에 따뜻한게 ...,n,n,0,1,0,0,...,0,0,0,0,0,0,0,0,0,0
8,9,18,http://103.60.126.35/Image/SNS/5607/5607_27.jpg,2020.9.8\n? 8:29~9:46 어깨 / 클로이팅 복근 2주챌린지 / 싸이클...,n,n,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
9,10,19,http://103.60.126.35/Image/SNS/5607/5607_28.jpg,2020.9.8\n아침 :오나오\n,n,n,0,0,1,0,...,0,0,0,0,0,0,0,0,0,0


In [17]:
len(image_dataset[image_dataset["deidentification_x"] == image_dataset["deidentification_y"]]) #Both columns are totally same!

272331

In [18]:
print(merged_df['deidentification'].unique())

['n' 'y' 'x']


In [19]:
len(image_dataset[image_dataset['deidentification_x'] == 'y']) #or deidentification_y

48224

In [101]:
image_dataset_no_face = image_dataset[image_dataset['deidentification_x'] == 'n']
image_dataset_no_face = image_dataset_no_face.drop(['text', 'deidentification_y'], axis=1)
len(image_dataset_no_face)

223703

In [102]:
image_dataset_no_face.tail(10)

Unnamed: 0,index,seq,url,deidentification_x,간편식,건강간식,건강식,건강음료,걷기/산책,격투기,...,일상생활,자전거,종합운동,줄넘기,축구/풋살,탁구,테니스,폴댄스,필라테스,홈트
272321,282259,402529,https://cyld20183.speedgabia.com/Image/SNS/520...,n,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
272322,282260,402530,https://cyld20183.speedgabia.com/Image/SNS/520...,n,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
272323,282261,402530,https://cyld20183.speedgabia.com/Image/SNS/520...,n,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
272324,282262,402530,https://cyld20183.speedgabia.com/Image/SNS/520...,n,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
272325,282263,402537,https://cyld20183.speedgabia.com/Image/SNS/645...,n,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
272326,282264,402539,https://cyld20183.speedgabia.com/Image/SNS/536...,n,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
272327,282265,402539,https://cyld20183.speedgabia.com/Image/SNS/536...,n,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
272328,282266,402540,https://cyld20183.speedgabia.com/Image/SNS/455...,n,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
272329,282267,402540,https://cyld20183.speedgabia.com/Image/SNS/455...,n,0,0,0,0,0,0,...,1,0,0,0,0,0,0,0,0,0
272330,282268,402540,https://cyld20183.speedgabia.com/Image/SNS/455...,n,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,1


In [103]:
#학습용 image 파일 만들기
image_dataset_no_face.to_csv('/home/ubuntu/Desktop/Project/datasets/circlin_feeds_dataset/image_dataset/20211201_image_dataset.csv', encoding="utf-8", index=False, header=True)

__!!!!!!!!!!Broken image files cannot be opened by both PIL.Image, cv2... So remove them from the list by try~except.__

In [104]:
trainable_images = image_dataset_no_face.copy()

In [105]:
trainable_images.reset_index(inplace=True, drop=True)
trainable_images.index

RangeIndex(start=0, stop=223703, step=1)

In [106]:
len(trainable_images)

223703

In [107]:
trainable_images.tail(20)

Unnamed: 0,index,seq,url,deidentification_x,간편식,건강간식,건강식,건강음료,걷기/산책,격투기,...,일상생활,자전거,종합운동,줄넘기,축구/풋살,탁구,테니스,폴댄스,필라테스,홈트
223683,282249,402519,https://cyld20183.speedgabia.com/Image/SNS/637...,n,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
223684,282250,402519,https://cyld20183.speedgabia.com/Image/SNS/637...,n,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
223685,282251,402523,https://cyld20183.speedgabia.com/Image/SNS/639...,n,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
223686,282252,402526,https://cyld20183.speedgabia.com/Image/SNS/518...,n,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
223687,282253,402527,https://cyld20183.speedgabia.com/Image/SNS/520...,n,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
223688,282254,402527,https://cyld20183.speedgabia.com/Image/SNS/520...,n,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
223689,282255,402528,https://cyld20183.speedgabia.com/Image/SNS/520...,n,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
223690,282256,402528,https://cyld20183.speedgabia.com/Image/SNS/520...,n,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
223691,282257,402529,https://cyld20183.speedgabia.com/Image/SNS/520...,n,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
223692,282258,402529,https://cyld20183.speedgabia.com/Image/SNS/520...,n,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [108]:
def url_change(image_localpath, url):
    splitted_url = url.split('/')
    new_url = '/'.join(splitted_url[2:])
    new_url = os.path.join(image_localpath, new_url)
    
    return new_url

image_localpath = os.path.join(path, 'raw_data/raw_image')
trainable_images['url'] = trainable_images['url'].apply(lambda x: url_change(image_localpath, x))

In [109]:
trainable_images.iloc[223702]['url']

'/home/ubuntu/Desktop/Project/datasets/circlin_feeds_dataset/raw_data/raw_image/cyld20183.speedgabia.com/Image/SNS/45516/45516_1625317443_1.jpg'

In [110]:
trainable_images.tail(20)

Unnamed: 0,index,seq,url,deidentification_x,간편식,건강간식,건강식,건강음료,걷기/산책,격투기,...,일상생활,자전거,종합운동,줄넘기,축구/풋살,탁구,테니스,폴댄스,필라테스,홈트
223683,282249,402519,/home/ubuntu/Desktop/Project/datasets/circlin_...,n,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
223684,282250,402519,/home/ubuntu/Desktop/Project/datasets/circlin_...,n,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
223685,282251,402523,/home/ubuntu/Desktop/Project/datasets/circlin_...,n,0,1,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
223686,282252,402526,/home/ubuntu/Desktop/Project/datasets/circlin_...,n,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
223687,282253,402527,/home/ubuntu/Desktop/Project/datasets/circlin_...,n,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
223688,282254,402527,/home/ubuntu/Desktop/Project/datasets/circlin_...,n,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
223689,282255,402528,/home/ubuntu/Desktop/Project/datasets/circlin_...,n,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
223690,282256,402528,/home/ubuntu/Desktop/Project/datasets/circlin_...,n,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
223691,282257,402529,/home/ubuntu/Desktop/Project/datasets/circlin_...,n,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0
223692,282258,402529,/home/ubuntu/Desktop/Project/datasets/circlin_...,n,0,0,0,0,0,0,...,0,0,0,0,0,0,0,0,0,0


In [111]:
##### !!!!!!!!!!Broken image files cannot be opened by both PIL.Image, cv2... So remove them from list by try~except.
broken_files = []
broken_index = []

for file in trainable_images['url']:
    index = trainable_images[trainable_images['url']==file].index[0] 
    if index % 5000 == 0:
        print(f"Now Doing: {index}, and {len(broken_files)} files({len(broken_index)} indexes) seem to be broken...")
    try:
        image = Image.open(file)
    except:
        drop_index = index
        broken_files.append(file)
        broken_index.append(drop_index)

print(f"{len(broken_files)} files are broken... Cannot open below files: \n {broken_files}")
removed_broken_urls = trainable_images.drop(broken_index)
print(f"Removed broken file rows. Now you can use {len(removed_broken_urls)} files. Data is as below: \n {removed_broken_urls}")

Now Doing: 0, and 0 files(0 indexes) seem to be broken...
Now Doing: 5000, and 0 files(0 indexes) seem to be broken...
Now Doing: 10000, and 0 files(0 indexes) seem to be broken...
Now Doing: 15000, and 0 files(0 indexes) seem to be broken...
Now Doing: 20000, and 0 files(0 indexes) seem to be broken...
Now Doing: 25000, and 45 files(45 indexes) seem to be broken...
Now Doing: 30000, and 110 files(110 indexes) seem to be broken...
Now Doing: 35000, and 151 files(151 indexes) seem to be broken...
Now Doing: 40000, and 200 files(200 indexes) seem to be broken...
Now Doing: 45000, and 246 files(246 indexes) seem to be broken...
Now Doing: 50000, and 324 files(324 indexes) seem to be broken...
Now Doing: 55000, and 590 files(590 indexes) seem to be broken...
Now Doing: 60000, and 930 files(930 indexes) seem to be broken...
Now Doing: 65000, and 1346 files(1346 indexes) seem to be broken...
Now Doing: 70000, and 1653 files(1653 indexes) seem to be broken...
Now Doing: 75000, and 1873 files(

In [112]:
removed_broken_urls.to_csv('/home/ubuntu/Desktop/Project/datasets/circlin_feeds_dataset/image_dataset/20211201_image_dataset(change_url).csv', encoding="utf-8", index=False, header=True)
#removed_broken_urls.to_csv('/home/ubuntu/Desktop/Project/datasets/circlin_feeds_dataset/image_dataset/image_dataset(20211125)(exclude_broken_urls).csv', encoding="utf-8", index=False, header=True)

In [113]:
len(removed_broken_urls)

215145

- Save the list of broken files, and use it if you need additional labeling.

In [114]:
sorted_broken_files = sorted(broken_files)
sorted_broken_index = sorted(broken_index)

In [115]:
sorted_broken_files[100]

'/home/ubuntu/Desktop/Project/datasets/circlin_feeds_dataset/raw_data/raw_image/103.60.126.35/Image/SNS/12761/12761_1616893346_1.jpg'

In [116]:
df_brokenfile = pd.DataFrame(data={'index': sorted_broken_index, 'file': sorted_broken_files})
df_brokenfile.head()

Unnamed: 0,index,file
0,22070,/home/ubuntu/Desktop/Project/datasets/circlin_...
1,22115,/home/ubuntu/Desktop/Project/datasets/circlin_...
2,22137,/home/ubuntu/Desktop/Project/datasets/circlin_...
3,22205,/home/ubuntu/Desktop/Project/datasets/circlin_...
4,22240,/home/ubuntu/Desktop/Project/datasets/circlin_...


In [117]:
len(df_brokenfile)

8558

In [118]:
df_brokenfile.to_csv('/home/ubuntu/Desktop/Project/datasets/circlin_feeds_dataset/image_dataset/20211201_broken_files.csv', encoding="utf-8", index=False, header=True)