In [2]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import japanize_matplotlib
import numpy as np
import os

# Load the six input tables (APP / Device / Media, 1st and 2nd files).
# 'user' is read as a string so IDs with leading zeros (e.g. '0765') survive.
(
    app_data_1st,
    app_data_2nd,
    device_data_1st,
    device_data_2nd,
    media_data_1st,
    media_data_2nd,
) = (
    pd.read_csv(csv_path, dtype={'user': str})
    for csv_path in (
        '../data/csv/complete/APP_1st.csv',
        '../data/csv/complete/APP_2nd.csv',
        '../data/csv/complete/Device_1st.csv',
        '../data/csv/complete/Device_2nd.csv',
        '../data/csv/complete/Media_1st.csv',
        '../data/csv/complete/Media_2nd.csv',
    )
)

In [3]:
def trim_label(label, max_length=18):
    """Shorten ``label`` so that it is at most ``max_length`` characters long.

    Parameters
    ----------
    label : str
        Text to shorten.
    max_length : int, default 18
        Maximum number of characters allowed in the result.

    Returns
    -------
    str
        ``label`` unchanged when it already fits; otherwise the leading
        characters of ``label`` followed by "..." so that the result is
        exactly ``max_length`` characters long. When ``max_length`` is
        smaller than the ellipsis itself (< 3) the label is hard-truncated
        to ``max_length`` characters without an ellipsis.
    """
    ellipsis = "..."
    if len(label) <= max_length:
        return label
    if max_length < len(ellipsis):
        # No room for the ellipsis. A naive label[:max_length - 3] would use a
        # negative slice bound here and return text LONGER than max_length.
        return label[:max(max_length, 0)]
    return label[:max_length - len(ellipsis)] + ellipsis

# Coarse category -> list of the fine-grained genre names it groups together.
new_categories = {
    'エンタメ': [
        'ドキュメンタリー',
        'アニメ/特撮',
        '映画',
        '音楽',
        '演劇/公演',
        'バラエティー',
        'ドラマ',
    ],
    'ニュース': [
        'ニュース/報道',
        '情報/ワイドショー',
    ],
    'スポーツ': [
        'スポーツ',
    ],
    '生活': [
        '趣味/教育',
        '福祉',
    ],
}

# Sorted category names give a stable legend order and a stable colour assignment.
legend_categories = sorted(new_categories)
print(legend_categories)

# One colour from seaborn's "muted" palette per category, paired in sorted order.
category_colors = sns.color_palette("muted", n_colors=len(legend_categories))
category_color_dict = {
    category_name: rgb
    for category_name, rgb in zip(legend_categories, category_colors)
}
print(category_color_dict)

def map_genre_to_category(genre):
    """Return the coarse category whose genre list contains ``genre``.

    Categories are searched in the iteration order of ``new_categories`` and
    the first match wins. ``np.nan`` is returned when no category lists the
    genre.
    """
    return next(
        (
            category_name
            for category_name, member_genres in new_categories.items()
            if genre in member_genres
        ),
        np.nan,
    )

# Add a coarse 'category' column to both media tables, derived element-wise
# from 'genre_name' (NaN where the genre belongs to no category).
media_data_1st['category'] = media_data_1st['genre_name'].map(map_genre_to_category)
media_data_2nd['category'] = media_data_2nd['genre_name'].map(map_genre_to_category)

# Distinct user IDs, taken from the first APP table only.
users = pd.unique(app_data_1st['user'])
print(users)

['エンタメ', 'スポーツ', 'ニュース', '生活']
{'エンタメ': (0.2823529411764706, 0.47058823529411764, 0.8156862745098039), 'スポーツ': (0.9333333333333333, 0.5215686274509804, 0.2901960784313726), 'ニュース': (0.41568627450980394, 0.8, 0.39215686274509803), '生活': (0.8392156862745098, 0.37254901960784315, 0.37254901960784315)}
['0765' '0816' '1143' '2387' '2457' '3613' '3828' '4545' '4703' '5711'
 '5833' '6420' '7471' '8058' '9556']


In [4]:
for user in users:
    print(f'Start processing data for user {user}')

    # Slice each of the six tables down to the rows belonging to this user.
    user_app_data_1st = app_data_1st.loc[app_data_1st['user'].eq(user)]
    user_device_data_1st = device_data_1st.loc[device_data_1st['user'].eq(user)]
    user_media_data_1st = media_data_1st.loc[media_data_1st['user'].eq(user)]
    user_app_data_2nd = app_data_2nd.loc[app_data_2nd['user'].eq(user)]
    user_device_data_2nd = device_data_2nd.loc[device_data_2nd['user'].eq(user)]
    user_media_data_2nd = media_data_2nd.loc[media_data_2nd['user'].eq(user)]

    # Stack both APP tables, then keep the names of the five apps with the
    # largest summed 'duration' for this user.
    app_data_total = pd.concat([user_app_data_1st, user_app_data_2nd])
    top_5_apps_total = (
        app_data_total
        .groupby('app_name')['duration']
        .sum()
        .sort_values(ascending=False)
        .head(5)
        .index
        .tolist()
    )

Start processing data for user 0765
Start processing data for user 0816
Start processing data for user 1143
Start processing data for user 2387
Start processing data for user 2457
Start processing data for user 3613
Start processing data for user 3828
Start processing data for user 4545
Start processing data for user 4703
Start processing data for user 5711
Start processing data for user 5833
Start processing data for user 6420
Start processing data for user 7471
Start processing data for user 8058
Start processing data for user 9556
