In [2]:
import requests
import pandas as pd
import numpy as np
import json
import os
import glob
import re
import time
from collections import Counter
from IPython.core.debugger import set_trace

from tqdm.auto import tqdm
tqdm.pandas()

import os
import sys
# Put the directory above the notebook's working directory on sys.path.
# NOTE(review): presumably this is what makes the `getchapp` package importable
# in the next cell — confirm against the project layout.
parentdir = os.path.abspath(os.path.join(os.getcwd(), os.pardir))
# Guard the append so that re-running this cell does not pile up duplicate entries.
if parentdir not in sys.path:
    sys.path.append(parentdir)

In [3]:
import django
from django.db import connection

# Initialise Django before any model is imported.
# NOTE(review): django.setup() needs the settings to be configured (e.g. via
# DJANGO_SETTINGS_MODULE); nothing in the visible cells does that — confirm it is
# provided by the environment the notebook is started in.
django.setup()

# Import every model the cells below use. The original cell only imported
# CustomEmailUser and Brand, so Profile/Hashtag/Feed/Channel raised NameError on a
# fresh kernel ("Restart & Run All").
# NOTE(review): the four added models are assumed to be defined in getchapp.models
# next to Brand — confirm the module path.
from getchapp.models import (
    CustomEmailUser,
    Brand,
    Profile,
    Hashtag,
    Feed,
    Channel,
)

# DB 초기화

In [5]:
def get_logo_fname(bname, where):
    """Return the bare file name of the logo file stored for a brand.

    Searches directory ``where`` for a file named ``bname`` with any extension
    and returns the base name (no directory part) of the first match that
    ``glob`` reports.

    Args:
        bname: Brand short name, used as the file stem.
        where: Directory to search in.

    Returns:
        The matching file's base name, or the placeholder ``'need file.jpg'``
        when no file matches or when the search pattern cannot be built
        (e.g. ``bname`` is not a string).
    """
    try:
        pattern = where + '/' + bname + '.*'
    except TypeError:
        # Preserve the original best-effort behaviour for non-string names
        # instead of crashing the whole import loop.
        return 'need file.jpg'

    matches = glob.glob(pattern)
    if not matches:
        return 'need file.jpg'

    # os.path.basename understands the platform's separators. The previous
    # split('\\') only stripped the directory on Windows and returned the whole
    # path on POSIX systems.
    return os.path.basename(matches[0])

get_logo_fname('helinox', 'uploads/brand_images')

'helinox.png'

In [5]:
# Empty the tables that the cells below re-populate.
# CustomEmailUser is deliberately not part of the list: its delete call was left
# commented out in the original cell.
# NOTE(review): because users are kept, re-running the CustomEmailUser bulk_create
# cell (which assigns explicit ids) will presumably collide on primary keys — confirm.
for model in (Profile, Hashtag, Brand, Feed, Channel):
    model.objects.all().delete()

# Brand

In [6]:
bulk_brand = []
# Read the brand sheet as a list of named tuples (one per spreadsheet row).
# NOTE(review): the [3:] slice drops the first three data rows of the sheet —
# presumably non-brand rows; confirm against the workbook.
ibrands = list(pd.read_excel('keywords and logos.xlsx', sheet_name='20190430').itertuples())[3:]

# Brands are numbered in descending shortname order (ascending sort, then reversed).
brands_in_order = sorted(ibrands, key=lambda x: x.shortname)[::-1]

for brand_id, row in enumerate(tqdm(brands_in_order), start=1):
    obj = Brand()
    obj.id = brand_id
    obj.name = row.shortname
    obj.fullname_kr = row.koname
    obj.fullname_en = row.fullname
    obj.keywords = row.keywords
    obj.origin = row.origin
    obj.awareness = row.awareness
    obj.category = row.category.lower()
    obj.description = ''
    # Build the stored path with '/' like the Profile and Feed cells do.
    # os.path.join would write a backslash into the database when run on Windows.
    obj.image = 'brand_images/' + get_logo_fname(row.shortname, 'uploads/brand_images')
    bulk_brand.append(obj)

# Single INSERT round for all brands; the trailing ';' hides the returned list.
Brand.objects.bulk_create(bulk_brand);

HBox(children=(IntProgress(value=0, max=324), HTML(value='')))




# Channel

In [7]:
# Create one Channel per Brand. Channel ids follow the order of bulk_brand,
# starting at 1.
bulk_channel = []
for channel_id, brand in enumerate(tqdm(bulk_brand), start=1):
    channel = Channel()
    channel.id = channel_id
    channel.content = brand
    bulk_channel.append(channel)

Channel.objects.bulk_create(bulk_channel);

HBox(children=(IntProgress(value=0, max=324), HTML(value='')))




# CustomEmailUser

In [8]:
# Load the user records from users.json. 'UTF-8-sig' makes the reader skip a
# leading byte-order mark if the file has one.
users = None
with open('users.json', encoding='UTF-8-sig') as users_file:
    users = json.load(users_file)

In [9]:
# Build one CustomEmailUser per entry of `users`. The dictionary key becomes the
# primary key and the e-mail is the record's 'user' value plus '@getch.com'.
bulk_emailuser = []
for user_id, record in tqdm(users.items()):
    account = CustomEmailUser()
    account.id = user_id
    account.email = record['user'] + '@getch.com'
    bulk_emailuser.append(account)

CustomEmailUser.objects.bulk_create(bulk_emailuser);

HBox(children=(IntProgress(value=0, max=8872), HTML(value='')))




# Profile

In [10]:
# Build one Profile per user record. Profiles are matched to the CustomEmailUser
# objects by position, so this relies on bulk_emailuser having been built from
# the same `users` mapping in the same order.
bulk_profile = []
user_records = list(users.items())
for position, (user_id, record) in enumerate(tqdm(user_records)):
    email = record['user'] + '@getch.com'
    # Keep only the last path segment of the profile image reference.
    image_name = record['profile_image'].split('/')[-1]

    profile = Profile()
    profile.id = user_id
    profile.user = bulk_emailuser[position]
    profile.image = 'profile_images/' + email + '/' + image_name
    bulk_profile.append(profile)

Profile.objects.bulk_create(bulk_profile);
# Alternative kept from the original cell (refresh only the image column):
# Profile.objects.bulk_update(bulk_profile, ['image']);

HBox(children=(IntProgress(value=0, max=8872), HTML(value='')))




# Hashtag

In [11]:
# Load the raw hashtag strings from hashtags.json. 'UTF-8-sig' makes the reader
# skip a leading byte-order mark if the file has one.
hashtags = None
with open('hashtags.json', encoding='UTF-8-sig') as hashtags_file:
    hashtags = json.load(hashtags_file)

In [12]:
# Character class matching every code point from U+10000 to U+10FFFF, i.e.
# everything above the Basic Multilingual Plane, where most emoji live. It is used
# with re.sub to strip those characters from text before saving; the Hashtag cell
# notes that MySQL did not store emoji well.
EMOJI = r'[\U00010000-\U0010ffff]'

In [13]:
# Build one Hashtag row per distinct, non-empty tag text.
bulk_hashtags = []
# Tag texts already queued. Stripping emoji can turn two different raw tags into
# the same text, and those must not be inserted twice.
seen_tags = set()

for i, _tag in enumerate(tqdm(hashtags)):
    # Emoji did not store well in MySQL, so they are removed altogether.
    _tag = re.sub(EMOJI, '', _tag)

    # Skip tags that were emoji-only and tags whose cleaned text is already queued.
    if _tag == '' or _tag in seen_tags:
        continue
    seen_tags.add(_tag)

    obj = Hashtag()
    # The id stays tied to the tag's position in the source list, as before,
    # so skipped entries leave gaps in the id sequence.
    obj.id = i+1
    obj.hashtag = _tag
    bulk_hashtags.append(obj)

Hashtag.objects.bulk_create(bulk_hashtags);

HBox(children=(IntProgress(value=0, max=21662), HTML(value='')))




# Feed

In [14]:
# Lookup table: brand name of a channel's content -> channel primary key.
channels_dict = {channel.content.name: channel.pk for channel in bulk_channel}

In [15]:
# Lookup table: user e-mail -> Profile object.
profile_dict = {profile.user.email: profile for profile in bulk_profile}

In [16]:
# Lookup table: cleaned hashtag text -> Hashtag primary key.
hashtags_dict = {hashtag.hashtag: hashtag.pk for hashtag in bulk_hashtags}

In [17]:
# Load the feed records from feeds.json. 'UTF-8-sig' makes the reader skip a
# leading byte-order mark if the file has one.
feeds = None
with open('feeds.json', encoding='UTF-8-sig') as feeds_file:
    feeds = json.load(feeds_file)

In [18]:
# Build (but do not yet insert) one Feed object per record in `feeds`.
# Feed ids are the 1-based positions of the records in the list.
bulk_feeds = []

for feed_id, entry in enumerate(tqdm(feeds), start=1):
    author_email = entry['author'] + '@getch.com'

    feed = Feed()
    feed.id = feed_id
    # Left disabled from the original cell:
    #   feed.membership = brand_dict[entry['membership']]
    # Original note: assigning a foreign key this way seems to require that the
    # referenced row already has an id (not verified).
    feed.author = profile_dict[author_email]
    # Parse the creation time, drop any timezone information and store it as text.
    created_at = pd.Timestamp(entry['created_at']).tz_localize(None)
    feed.timestamp = str(created_at)
    feed.nlikes = entry['nlikes']
    feed.content = re.sub(EMOJI, '', entry['content'])

    if 'feed_image' in entry:
        # Original note on the membership segment: when a feed belongs to several
        # pages the plan is to join them with '-', e.g. nike-adidas-custompage.
        image_name = entry['feed_image'].split('/')[-1]
        feed.image = 'feed_images/' + entry['membership'] + '/' + author_email + '/' + image_name

    bulk_feeds.append(feed)

HBox(children=(IntProgress(value=0, max=46960), HTML(value='')))




In [19]:
# Alternatives kept for reference: refresh a single column of already-inserted
# feeds instead of inserting them.
# Feed.objects.bulk_update(bulk_feeds, ['timestamp']);
# Feed.objects.bulk_update(bulk_feeds, ['image']);
# Insert every Feed object collected in bulk_feeds with one bulk_create call.
# The trailing ';' suppresses the cell output.
Feed.objects.bulk_create(bulk_feeds);

In [20]:
# Intermediate (through) model of the Feed <-> Hashtag many-to-many relation.
FeedHashtagsRelation = Feed.hashtags.through

# Build the feed/hashtag link rows directly. Feed ids are the 1-based positions
# in `feeds`, matching the ids assigned when the Feed objects were built.
relations = []
for feed_id, _feed in enumerate(tqdm(feeds), start=1):
    # Hashtag ids already linked to this feed. A feed may list the same tag twice,
    # or two tags may become identical once emoji are stripped; the automatically
    # created through table allows each (feed, hashtag) pair only once.
    linked_hashtag_ids = set()
    for ht in _feed['hashtags']:
        ht = re.sub(EMOJI, '', ht)
        if ht == '':
            continue
        # Unknown tags still raise KeyError so that data problems stay visible.
        hashtag_id = hashtags_dict[ht]
        if hashtag_id in linked_hashtag_ids:
            continue
        linked_hashtag_ids.add(hashtag_id)
        relations.append(FeedHashtagsRelation(feed_id=feed_id, hashtag_id=hashtag_id))

FeedHashtagsRelation.objects.bulk_create(relations);

HBox(children=(IntProgress(value=0, max=46960), HTML(value='')))




In [21]:
# Intermediate (through) model of the Feed <-> Channel many-to-many relation.
FeedChannelsRelation = Feed.channels.through

# Link every feed to the single channel named by its 'membership' value.
# Feed ids are the 1-based positions of the records in `feeds`.
relations = [
    FeedChannelsRelation(feed_id=feed_id, channel_id=channels_dict[entry['membership']])
    for feed_id, entry in enumerate(tqdm(feeds), start=1)
]

FeedChannelsRelation.objects.bulk_create(relations);

HBox(children=(IntProgress(value=0, max=46960), HTML(value='')))


