# Data acquisition from Kaggle

**Important Note:** You must sign up for the competition [here](https://www.kaggle.com/c/coupon-purchase-prediction/data) and download your kaggle.json from your Kaggle account page. See Steps 1-2 [here](https://www.analyticsvidhya.com/blog/2021/06/how-to-load-kaggle-datasets-directly-into-google-colab/) for more information.

In [None]:
from google.colab import files

# UPLOAD YOUR KAGGLE.JSON
# Only run this cell if you need to upload kaggle.json
# files.upload() returns a dict of {filename: file bytes}. Bind it to a name so
# it is not the cell's last expression; otherwise the notebook echoes the raw
# API key into the cell output and stores it in the saved .ipynb.
uploaded_files = files.upload()

Saving kaggle.json to kaggle.json


{'kaggle.json': b'{"username":"catapultic","key":"<REDACTED>"}'}

In [None]:
# Kaggle credentials setup
!pip install kaggle
!mkdir ~/.kaggle
!cp kaggle.json ~/.kaggle/
!chmod 600 ~/.kaggle/kaggle.json



In [None]:
# Download Coupon Purchase Prediction data set
# The competition archives are written into the local `data` directory.
# Uses the kaggle.json credentials copied to ~/.kaggle/ in the previous cell.
!kaggle competitions download -c coupon-purchase-prediction -p data

Downloading coupon_list_test.csv.zip to data
  0% 0.00/11.6k [00:00<?, ?B/s]
100% 11.6k/11.6k [00:00<00:00, 7.42MB/s]
Downloading coupon_area_train.csv.zip to data
  0% 0.00/832k [00:00<?, ?B/s]
100% 832k/832k [00:00<00:00, 53.7MB/s]
Downloading sample_submission.csv.zip to data
  0% 0.00/400k [00:00<?, ?B/s]
100% 400k/400k [00:00<00:00, 57.4MB/s]
Downloading coupon_area_test.csv.zip to data
  0% 0.00/14.0k [00:00<?, ?B/s]
100% 14.0k/14.0k [00:00<00:00, 28.7MB/s]
Downloading documentation.zip to data
  0% 0.00/21.6k [00:00<?, ?B/s]
100% 21.6k/21.6k [00:00<00:00, 19.6MB/s]
Downloading coupon_visit_train.csv.zip to data
 77% 65.0M/84.5M [00:03<00:02, 9.31MB/s]
100% 84.5M/84.5M [00:03<00:00, 23.2MB/s]
Downloading coupon_list_train.csv.zip to data
  0% 0.00/656k [00:00<?, ?B/s]
100% 656k/656k [00:00<00:00, 42.4MB/s]
Downloading user_list.csv.zip to data
  0% 0.00/627k [00:00<?, ?B/s]
100% 627k/627k [00:00<00:00, 88.5MB/s]
Downloading prefecture_locations.csv to data
  0% 0.00/2.00k [00:00<

In [None]:
# unzip and reorganize the zipped tables
# Master list of users
!unzip data/user_list.csv.zip -d data/

# Master list of coupons (train & test)
!unzip data/coupon_list_train.csv.zip -d data/
!unzip data/coupon_list_test.csv.zip -d data/

# Table containing physical areas where coupons are available (train & test)
!unzip data/coupon_area_train.csv.zip -d data/
!unzip data/coupon_area_test.csv.zip -d data/

# Purchase log of users buying coupons during the training period (train only)
!unzip data/coupon_detail_train.csv.zip -d data/

# Browsing log of users visiting coupons during the training period (train only)
!unzip data/coupon_visit_train.csv.zip -d data/

Archive:  data/user_list.csv.zip
  inflating: data/user_list.csv      
Archive:  data/coupon_list_train.csv.zip
  inflating: data/coupon_list_train.csv  
Archive:  data/coupon_list_test.csv.zip
  inflating: data/coupon_list_test.csv  
Archive:  data/coupon_area_train.csv.zip
  inflating: data/coupon_area_train.csv  
Archive:  data/coupon_area_test.csv.zip
  inflating: data/coupon_area_test.csv  
Archive:  data/coupon_detail_train.csv.zip
  inflating: data/coupon_detail_train.csv  
Archive:  data/coupon_visit_train.csv.zip
  inflating: data/coupon_visit_train.csv  


In [None]:
# Delete unused zip files
# This removes every archive in data/, including the ones that were downloaded
# but not extracted above (e.g. sample_submission.csv.zip, documentation.zip).
# -f keeps rm quiet when no archive is left to delete.
!rm -f data/*.zip

## Translation of Japanese columns to English
Note: This does a full translation of the Japanese text into English. It does not transliterate Japanese place names into their romanized forms, so we sometimes end up with the literal meaning of a Japanese name, like "Place which is by the water." That is okay - the names are not important for training; they just help us explore the data and understand what we are looking at.

In [None]:
# dependencies
%%capture
!pip install git+https://github.com/neuml/txtai#egg=txtai[pipeline]
!pip install pykakasi 

# imports
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
from txtai.pipeline import Translation
import pykakasi

# txtai translation pipeline; the helpers below call it as translate(text, 'en')
translate = Translation()
# pykakasi converter; the helpers below read the 'hepburn' field of kks.convert(text)
kks = pykakasi.kakasi()

#### Translation helper functions

In [None]:
# Lookup table of translations to save time (source text -> translated text)
translations = {}

# Translates jp->en using txtai package (unless NaN)
def safe_translate(jp, transliterate=False):
  """Translate a single value to English, passing missing values through.

  Args:
    jp: Text to translate, or a missing value (None / NaN).
    transliterate: If truthy, romanise the text with pykakasi (Hepburn)
      instead of translating it with txtai.

  Returns:
    The translated (or romanised) text, or `jp` itself when it is missing.
  """
  if pd.isna(jp):
    return jp
  if transliterate: # use pykakasi
    return ''.join(item['hepburn'].capitalize() for item in kks.convert(jp))
  return translate(jp, 'en') # using txtai

# Checks the translation dict first before translating
def lookup_or_translate(jp):
  """Return the cached translation of `jp`, translating it on first use.

  Missing values are returned unchanged and never cached: NaN does not compare
  equal to itself, so caching it could add a new dict entry for every distinct
  NaN object without ever producing a cache hit.
  """
  if pd.isna(jp):
    return jp
  if jp not in translations:
    translations[jp] = safe_translate(jp) # pass transliterate=True to use kakasi
  return translations[jp]

# Translates an entire column/list of data
def translate_list(data):
  """Return the translation of every entry of `data`, in the same order.

  Each entry is resolved through lookup_or_translate; tqdm wraps the iteration
  to show progress.
  """
  return [lookup_or_translate(entry) for entry in tqdm(data)]

In [None]:
# Main loading function - takes a csv path and the columns to translate,
# and returns a Pandas dataframe. The source columns are left untouched; each
# translation is stored in a new '<column>_en_t' column.
def load_translate(csv_path, translate_columns=None):
  """Read a CSV file and append a translated copy of the requested columns.

  Args:
    csv_path: Path of the CSV file to read.
    translate_columns: Names of the columns to translate. None (the default)
      or an empty sequence means that nothing is translated.

  Returns:
    The loaded DataFrame with one extra '<column>_en_t' column per entry of
    `translate_columns`.
  """
  df = pd.read_csv(csv_path)
  # `None` replaces the former mutable `[]` default; both mean "no columns"
  for c in translate_columns or ():
    df[f'{c}_en_t'] = translate_list(df[c])
  return df

In [None]:
# Columns to translate, listed per table.
# (The coupon visit training set has nothing that needs translating.)

# user_list.csv
user_cols = ['PREF_NAME']

# coupon_list_train.csv / coupon_list_test.csv
c_list_cols = [
    'CAPSULE_TEXT',
    'GENRE_NAME',
    'large_area_name',
    'ken_name',
    'small_area_name',
]

# coupon_detail_train.csv
c_detail_cols = ['SMALL_AREA_NAME']

# coupon_area_train.csv / coupon_area_test.csv
c_area_cols = [
    'SMALL_AREA_NAME',
    'PREF_NAME',
]

# prefecture_locations.csv
c_pref_cols = [
    'PREF_NAME',
    'PREFECTUAL_OFFICE',
]

In [None]:
# Perform the translations and load the data into DataFrames
# Every listed column gets a machine-translated '<column>_en_t' companion.
# All calls share the `translations` cache, so a string that already appeared
# in an earlier table is not translated again.
df_users = load_translate('data/user_list.csv', user_cols)
df_area_train = load_translate('data/coupon_area_train.csv', c_area_cols)
df_area_test = load_translate('data/coupon_area_test.csv', c_area_cols)
df_c_list_train = load_translate('data/coupon_list_train.csv', c_list_cols)
df_c_list_test  = load_translate('data/coupon_list_test.csv', c_list_cols)
df_c_detail_train = load_translate('data/coupon_detail_train.csv', c_detail_cols)
# No columns are passed for the visit log, so it is loaded as-is
df_visit_train = load_translate('data/coupon_visit_train.csv')
df_locations = load_translate('data/prefecture_locations.csv', c_pref_cols)

In [None]:
# Map JP-EN for prefecture names.
# NOTE: the English names are paired with the Japanese ones purely by position,
# so this list has to follow the order in which `unique()` returns the values.
# The leading 'N/A' presumably lines up with the missing value of users
# without a prefecture - verify against the data.
pref_names_jp = df_users.PREF_NAME.unique()
pref_names_en = ['N/A', 'Tokyo', 'Aichi Prefecture', 'Kanagawa Prefecture', 
                'Hiroshima Prefecture', 'Saitama Prefecture', 'Nara Prefecture',
                'Ishikawa Prefecture', 'Osaka prefecture',
                'Kumamoto Prefecture', 'Fukuoka Prefecture', 'Hokkaido', 'Kyoto', 
                'Akita', 'Chiba Prefecture', 'Nagasaki Prefecture', 
                'Hyogo Prefecture', 'Okinawa', 'Mie', 'Ibaraki Prefecture', 
                'Kagoshima Prefecture', 'Miyagi Prefecture', 'Shizuoka Prefecture', 
                'Wakayama Prefecture', 'Nagano Prefecture', 'Okayama Prefecture', 
                'Tochigi Prefecture','Shiga Prefecture', 'Toyama Prefecture', 
                'Saga Prefecture', 'Miyazaki Prefecture', 'Iwate Prefecture', 
                'Niigata Prefecture', 'Oita Prefecture', 'Yamaguchi Prefecture', 
                'Gifu Prefecture','Gunma Prefecture', 'Fukushima Prefecture', 
                'Ehime Prefecture', 'Kagawa Prefecture', 'Yamanashi Prefecture', 
                'Kochi Prefecture', 'Shimane Prefecture', 'Tokushima Prefecture', 
                'Fukui Prefecture', 'Aomori Prefecture', 'Yamagata Prefecture', 
                'Tottori Prefecture']

print(f'Dictionary length - jp: {len(pref_names_jp)}, en: {len(pref_names_en)}')
# zip() silently stops at the shorter input, so fail loudly on a length mismatch
assert len(pref_names_jp) == len(pref_names_en), 'prefecture name lists differ in length'
pref_name_dict = dict(zip(pref_names_jp, pref_names_en))

# Replace the Japanese column (and its machine translation) by the mapped one
df_users['PREF_NAME_EN'] = df_users['PREF_NAME'].map(pref_name_dict)
df_users = df_users.drop(columns=['PREF_NAME', 'PREF_NAME_en_t'])
df_users


jp: 48, en: 48


Unnamed: 0,REG_DATE,SEX_ID,AGE,WITHDRAW_DATE,USER_ID_hash,PREF_NAME_EN
0,2012-03-28 14:14:18,f,25,,d9dca3cb44bab12ba313eaa681f663eb,
1,2011-05-18 00:41:48,f,34,,560574a339f1b25e57b0221e486907ed,Tokyo
2,2011-06-13 16:36:58,m,41,,e66ae91b978b3229f8fd858c80615b73,Aichi Prefecture
3,2012-02-08 12:56:15,m,25,,43fc18f32eafb05713ec02935e2c2825,
4,2011-05-22 23:43:56,m,62,,dc6df8aa860f8db0d710ce9d4839840f,Kanagawa Prefecture
...,...,...,...,...,...,...
22868,2011-12-12 15:42:56,f,24,,2f0a2f36a9f63b6ba2fa3a7e53bef906,
22869,2011-08-10 00:49:55,m,41,,6ae7811a9c7c58546d6a1567ab098c21,Kyoto
22870,2012-04-05 12:24:23,f,35,,a417308c6a79ae0d86976401ec2e3b04,
22871,2011-02-20 10:34:22,f,59,,4937ec1c86e71d901c4ccc0357cff0b1,


In [None]:
# Translate coupon capsule text
# The English labels are paired with the Japanese ones by position, following
# the order of `unique()` on the training coupons. Labels that only occur in
# the test coupons have no entry and map to NaN.
# NOTE: 'Beauty' appears twice, so two distinct Japanese labels end up with
# the same English text.
capsule_text_jp = df_c_list_train['CAPSULE_TEXT'].unique()
capsule_text_en = ['Restaurant', 'Hair salon', 'Spa', 'Relaxation', 'Beauty', 
                   'Nail and eye salon', 'Delivery service', 'Lesson',
                   'Gift card', 'Other coupon', 'Leisure',
                   'Hotel', 'Japanese inn', 'Vacation rental', 'Lodge',
                   'Resort inn', 'Guest house', 'Japanese guest house',
                   'Public inn', 'Beauty', 'Event', 'Web service', 
                   'Health / medical', 'Class', 'Correspondence course']



print(f'dictionary length - jp: {len(capsule_text_jp)}, en: {len(capsule_text_en)}')
# zip() silently stops at the shorter input, so fail loudly on a length mismatch
assert len(capsule_text_jp) == len(capsule_text_en), 'capsule text lists differ in length'

capsule_text_dict = dict(zip(capsule_text_jp, capsule_text_en))
df_c_list_train['CAPSULE_TEXT_EN'] = df_c_list_train['CAPSULE_TEXT'].map(capsule_text_dict)
df_c_list_test['CAPSULE_TEXT_EN'] = df_c_list_test['CAPSULE_TEXT'].map(capsule_text_dict)

dictionary length - jp: 25, en: 25


Unnamed: 0,CAPSULE_TEXT,GENRE_NAME,PRICE_RATE,CATALOG_PRICE,DISCOUNT_PRICE,DISPFROM,DISPEND,DISPPERIOD,VALIDFROM,VALIDEND,VALIDPERIOD,USABLE_DATE_MON,USABLE_DATE_TUE,USABLE_DATE_WED,USABLE_DATE_THU,USABLE_DATE_FRI,USABLE_DATE_SAT,USABLE_DATE_SUN,USABLE_DATE_HOLIDAY,USABLE_DATE_BEFORE_HOLIDAY,large_area_name,ken_name,small_area_name,COUPON_ID_hash,CAPSULE_TEXT_en_t,GENRE_NAME_en_t,large_area_name_en_t,ken_name_en_t,small_area_name_en_t,CAPSULE_TEXT_EN
0,グルメ,グルメ,52,5659,2690,2012-06-26 12:00:00,2012-06-30 12:00:00,4,2012-07-01,2012-10-27,118.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,関西,大阪府,ミナミ他,c76ea297ebd3a5a4d3bf9f75269f66fa,Graeme.,Graeme.,Kansai,Osaka.,Minashima,Restaurant
1,グルメ,グルメ,52,18000,8500,2012-06-27 12:00:00,2012-07-04 12:00:00,7,2012-07-05,2012-10-13,100.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,関東,東京都,銀座・新橋・東京・上野,dd74dc95ca294afa02db40a543ae1763,Graeme.,Graeme.,Kanto,Tokyo City,"Ginza, New Bridge, Tokyo, Ueno.",Restaurant
2,グルメ,グルメ,51,7200,3480,2012-06-28 12:00:00,2012-07-05 12:00:00,7,2012-07-06,2012-12-28,175.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,関東,神奈川県,横浜,c65b550cbef918796ad53b1d5b7165c1,Graeme.,Graeme.,Kanto,Kanagawa,Yokohama,Restaurant
3,グルメ,グルメ,50,3300,1650,2012-06-24 12:00:00,2012-06-29 12:00:00,5,2012-06-30,2012-10-31,123.0,1.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,2.0,関西,兵庫県,兵庫,f93dc6e223935d817e1237f8f73b56a2,Graeme.,Graeme.,Kansai,"(For fully formatted text, see publication)",He's in the armory.,Restaurant
4,グルメ,グルメ,56,3650,1600,2012-06-26 12:00:00,2012-07-03 12:00:00,7,2012-07-04,2012-11-10,129.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,東海,愛知県,愛知,86c64391318f1d751647bf8e1882cf1d,Graeme.,Graeme.,East Sea,PHILIPPIA,Love,Restaurant


In [None]:
# Genre name translation
# English labels are paired by position with the unique Japanese genre names
# of the training coupons.
genre_name_jp = df_c_list_train['GENRE_NAME'].unique()
genre_name_en = [
    'Gourmet dining', 'Hair salon', 'Spa', 'Relaxation', 'Beauty',
    'Nail and eye salon', 'Delivery service', 'Class', 'Gift Card',
    'Other coupons', 'Leisure', 'Hotels and inns', 'Health and medical',
]

assert len(genre_name_jp) == len(genre_name_en)

genre_name_dict = dict(zip(genre_name_jp, genre_name_en))
df_c_list_train['GENRE_NAME_EN'] = df_c_list_train['GENRE_NAME'].map(genre_name_dict)
df_c_list_test['GENRE_NAME_EN'] = df_c_list_test['GENRE_NAME'].map(genre_name_dict)

# The Japanese capsule/genre columns are no longer needed once mapped
df_c_list_train = df_c_list_train.drop(columns=['CAPSULE_TEXT', 'GENRE_NAME'])
df_c_list_test = df_c_list_test.drop(columns=['CAPSULE_TEXT', 'GENRE_NAME'])

In [None]:
# Large area name translation
# Train and test coupons are combined so that the mapping covers both splits.
df_c_list = pd.concat([df_c_list_train, df_c_list_test])
large_area_name_jp = df_c_list['large_area_name'].unique()
# Paired by position with `large_area_name_jp`.
# 'Chugoku' is the Japanese region; the literal translation 'China' is wrong here.
large_area_name_en = ['Kanto', 'Kansai', 'Tokai', 'Hokkaido', 'Kyushu-Okinawa', 
                      'Tohoku', 'Shikoku', 'Chugoku', "Hokushin'etsu"]

# zip() silently stops at the shorter input; report both sizes on a mismatch
assert len(large_area_name_jp) == len(large_area_name_en), (
    f'large area lists differ in length - jp: {len(large_area_name_jp)}, '
    f'en: {len(large_area_name_en)}')

large_area_dict = dict(zip(large_area_name_jp, large_area_name_en))

df_c_list_train['LARGE_AREA_NAME_EN'] = df_c_list_train['large_area_name'].map(large_area_dict)
df_c_list_test['LARGE_AREA_NAME_EN'] = df_c_list_test['large_area_name'].map(large_area_dict)

AssertionError: ignored

In [None]:
# Drop the Japanese large-area column and the machine-translated helper columns
# that the *_EN columns replace. Both splits are cleaned here: dropping from the
# test frame only leaves these columns in the training frame on a fresh run.
superseded_cols = ['large_area_name', 'CAPSULE_TEXT_en_t', 'GENRE_NAME_en_t', 'large_area_name_en_t']
df_c_list_train = df_c_list_train.drop(columns=superseded_cols)
df_c_list_test = df_c_list_test.drop(columns=superseded_cols)
df_c_list_test.head()

Unnamed: 0,PRICE_RATE,CATALOG_PRICE,DISCOUNT_PRICE,DISPFROM,DISPEND,DISPPERIOD,VALIDFROM,VALIDEND,VALIDPERIOD,USABLE_DATE_MON,USABLE_DATE_TUE,USABLE_DATE_WED,USABLE_DATE_THU,USABLE_DATE_FRI,USABLE_DATE_SAT,USABLE_DATE_SUN,USABLE_DATE_HOLIDAY,USABLE_DATE_BEFORE_HOLIDAY,ken_name,small_area_name,COUPON_ID_hash,ken_name_en_t,small_area_name_en_t,CAPSULE_TEXT_EN,GENRE_NAME_EN,LARGE_AREA_NAME_EN
0,52,5659,2690,2012-06-26 12:00:00,2012-06-30 12:00:00,4,2012-07-01,2012-10-27,118.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,大阪府,ミナミ他,c76ea297ebd3a5a4d3bf9f75269f66fa,Osaka.,Minashima,Restaurant,Gourmet dining,Kansai
1,52,18000,8500,2012-06-27 12:00:00,2012-07-04 12:00:00,7,2012-07-05,2012-10-13,100.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,東京都,銀座・新橋・東京・上野,dd74dc95ca294afa02db40a543ae1763,Tokyo City,"Ginza, New Bridge, Tokyo, Ueno.",Restaurant,Gourmet dining,Kanto
2,51,7200,3480,2012-06-28 12:00:00,2012-07-05 12:00:00,7,2012-07-06,2012-12-28,175.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,神奈川県,横浜,c65b550cbef918796ad53b1d5b7165c1,Kanagawa,Yokohama,Restaurant,Gourmet dining,Kanto
3,50,3300,1650,2012-06-24 12:00:00,2012-06-29 12:00:00,5,2012-06-30,2012-10-31,123.0,1.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,2.0,兵庫県,兵庫,f93dc6e223935d817e1237f8f73b56a2,"(For fully formatted text, see publication)",He's in the armory.,Restaurant,Gourmet dining,Kansai
4,56,3650,1600,2012-06-26 12:00:00,2012-07-03 12:00:00,7,2012-07-04,2012-11-10,129.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,愛知県,愛知,86c64391318f1d751647bf8e1882cf1d,PHILIPPIA,Love,Restaurant,Gourmet dining,Tokai


In [None]:
# Small area name translation
# English names are paired by position with the unique Japanese names of the
# combined train + test coupon list.
small_area_jp = df_c_list['small_area_name'].unique()
# Place names are kept as names: 'Oita', 'Mie' and 'Kita' replace the literal
# machine translations 'Much', 'Triple' and 'Northern'.
small_area_en = ["Saitama", "Chiba", "Shinjuku, Takadanobaba Nakano - Kichijoji",
                 "Kyoto", "Ebisu / Meguro / Shinagawa", 
                 "Ginza, Shinbashi, Tokyo, Ueno", "Aichi", 
                 "Kawasaki, Shonan, Hakone, etc", 'Hokkaido', "Fukuoka", "Tochigi",
                 "Minami other", "Shibuya, Aoyama, Jiyugaoka",
                 "Ikebukuro Kagurazaka-Akabane", "Akasaka, Roppongi, Azabu",
                 "Yokohama", "Miyagi", "Fukushima", "Oita", "Kochi", 
                 "Tachikawa Machida, Hachioji other", "Hiroshima","Niigata", 
                 "Okayama", "Ehime", "Kagawa", "Kita", "Tokushima", "Hyogo",
                 "Gifu", "Miyazaki", "Nagasaki", "Ishikawa", "Yamagata", "Shizuoka",
                 "Aomori", "Okinawa", "Akita", "Nagano", "Iwate", "Kumamoto",
                 "Yamaguchi", "Saga", "Nara", "Mie", "Gunma", "Wakayama", 
                 "Yamanashi", "Tottori", "Kagoshima", "Fukui", "Shiga", "Toyama",
                 "Shimane", "Ibaraki"]

# zip() silently stops at the shorter input, so fail loudly on a length mismatch
assert len(small_area_jp) == len(small_area_en), 'small area lists differ in length'

small_area_dict = dict(zip(small_area_jp, small_area_en))
df_c_list_train['SMALL_AREA_NAME_EN'] = df_c_list_train['small_area_name'].map(small_area_dict)
df_c_list_test['SMALL_AREA_NAME_EN'] = df_c_list_test['small_area_name'].map(small_area_dict)

df_c_list_train.head()

Unnamed: 0,PRICE_RATE,CATALOG_PRICE,DISCOUNT_PRICE,DISPFROM,DISPEND,DISPPERIOD,VALIDFROM,VALIDEND,VALIDPERIOD,USABLE_DATE_MON,USABLE_DATE_TUE,USABLE_DATE_WED,USABLE_DATE_THU,USABLE_DATE_FRI,USABLE_DATE_SAT,USABLE_DATE_SUN,USABLE_DATE_HOLIDAY,USABLE_DATE_BEFORE_HOLIDAY,ken_name,small_area_name,COUPON_ID_hash,ken_name_en_t,small_area_name_en_t,CAPSULE_TEXT_EN,GENRE_NAME_EN,LARGE_AREA_NAME_EN,SMALL_AREA_NAME_EN
0,50,3000,1500,2011-07-08 12:00:00,2011-07-09 12:00:00,1,2011-07-10,2011-12-08,151.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,埼玉県,埼玉,6b263844241eea98c5a97f1335ea82af,Zheng-yang,埼玉,Restaurant,Gourmet dining,Kanto,Saitama
1,51,2080,1000,2011-07-01 12:00:00,2011-07-02 12:00:00,1,2011-07-03,2011-12-04,154.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,千葉県,千葉,cc031f250e8bad1e24060263b9fc0ddd,Cypriot,Cygnus,Restaurant,Gourmet dining,Kanto,Chiba
2,50,7000,3500,2011-07-12 12:00:00,2011-07-15 12:00:00,3,2011-07-16,2012-01-11,179.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,千葉県,千葉,ba5e9b7453ca52ff711635a5d2e8102d,Cypriot,Cygnus,Restaurant,Gourmet dining,Kanto,Chiba
3,50,3000,1500,2011-07-09 12:00:00,2011-07-11 12:00:00,2,2011-07-12,2011-12-01,142.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,千葉県,千葉,3e1ffbedca3569f9e8032d401e8cb4e6,Cypriot,Cygnus,Restaurant,Gourmet dining,Kanto,Chiba
4,50,2000,1000,2011-07-05 12:00:00,2011-07-06 12:00:00,1,2011-07-07,2011-12-30,176.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,千葉県,千葉,782934b6c815b4030ea204eef7d4a734,Cypriot,Cygnus,Restaurant,Gourmet dining,Kanto,Chiba


In [None]:
# The Japanese small-area column and its machine translation are replaced by
# SMALL_AREA_NAME_EN, so remove them from both splits.
small_area_source_cols = ['small_area_name', 'small_area_name_en_t']
df_c_list_train = df_c_list_train.drop(columns=small_area_source_cols)
df_c_list_test = df_c_list_test.drop(columns=small_area_source_cols)

In [None]:
# Preview the test coupon list after the small-area columns were replaced
df_c_list_test.head()

Unnamed: 0,PRICE_RATE,CATALOG_PRICE,DISCOUNT_PRICE,DISPFROM,DISPEND,DISPPERIOD,VALIDFROM,VALIDEND,VALIDPERIOD,USABLE_DATE_MON,USABLE_DATE_TUE,USABLE_DATE_WED,USABLE_DATE_THU,USABLE_DATE_FRI,USABLE_DATE_SAT,USABLE_DATE_SUN,USABLE_DATE_HOLIDAY,USABLE_DATE_BEFORE_HOLIDAY,ken_name,COUPON_ID_hash,ken_name_en_t,CAPSULE_TEXT_EN,GENRE_NAME_EN,LARGE_AREA_NAME_EN,SMALL_AREA_NAME_EN
0,52,5659,2690,2012-06-26 12:00:00,2012-06-30 12:00:00,4,2012-07-01,2012-10-27,118.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,大阪府,c76ea297ebd3a5a4d3bf9f75269f66fa,Osaka.,Restaurant,Gourmet dining,Kansai,Minami other
1,52,18000,8500,2012-06-27 12:00:00,2012-07-04 12:00:00,7,2012-07-05,2012-10-13,100.0,1.0,1.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,東京都,dd74dc95ca294afa02db40a543ae1763,Tokyo City,Restaurant,Gourmet dining,Kanto,"Ginza, Shinbashi, Tokyo, Ueno"
2,51,7200,3480,2012-06-28 12:00:00,2012-07-05 12:00:00,7,2012-07-06,2012-12-28,175.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,神奈川県,c65b550cbef918796ad53b1d5b7165c1,Kanagawa,Restaurant,Gourmet dining,Kanto,Yokohama
3,50,3300,1650,2012-06-24 12:00:00,2012-06-29 12:00:00,5,2012-06-30,2012-10-31,123.0,1.0,1.0,1.0,1.0,2.0,2.0,1.0,1.0,2.0,兵庫県,f93dc6e223935d817e1237f8f73b56a2,"(For fully formatted text, see publication)",Restaurant,Gourmet dining,Kansai,Hyogo
4,56,3650,1600,2012-06-26 12:00:00,2012-07-03 12:00:00,7,2012-07-04,2012-11-10,129.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,愛知県,86c64391318f1d751647bf8e1882cf1d,PHILIPPIA,Restaurant,Gourmet dining,Tokai,Aichi


In [None]:
# ken name
# The prefecture mapping built from the user list is reused here; a name
# without an entry in that mapping is kept unchanged.
ken_jp = df_c_list['ken_name'].unique()
ken_en = [pref_name_dict.get(name, name) for name in ken_jp]

assert len(ken_jp) == len(ken_en)

ken_dict = dict(zip(ken_jp, ken_en))
df_c_list_train['KEN_NAME_EN'] = df_c_list_train['ken_name'].map(ken_dict)
df_c_list_test['KEN_NAME_EN'] = df_c_list_test['ken_name'].map(ken_dict)

# KEN_NAME_EN replaces the Japanese column and its machine translation
ken_source_cols = ['ken_name', 'ken_name_en_t']
df_c_list_train = df_c_list_train.drop(columns=ken_source_cols)
df_c_list_test = df_c_list_test.drop(columns=ken_source_cols)

df_c_list_train.head()

Unnamed: 0,PRICE_RATE,CATALOG_PRICE,DISCOUNT_PRICE,DISPFROM,DISPEND,DISPPERIOD,VALIDFROM,VALIDEND,VALIDPERIOD,USABLE_DATE_MON,USABLE_DATE_TUE,USABLE_DATE_WED,USABLE_DATE_THU,USABLE_DATE_FRI,USABLE_DATE_SAT,USABLE_DATE_SUN,USABLE_DATE_HOLIDAY,USABLE_DATE_BEFORE_HOLIDAY,COUPON_ID_hash,CAPSULE_TEXT_EN,GENRE_NAME_EN,LARGE_AREA_NAME_EN,SMALL_AREA_NAME_EN,KEN_NAME_EN
0,50,3000,1500,2011-07-08 12:00:00,2011-07-09 12:00:00,1,2011-07-10,2011-12-08,151.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,0.0,6b263844241eea98c5a97f1335ea82af,Restaurant,Gourmet dining,Kanto,Saitama,Saitama Prefecture
1,51,2080,1000,2011-07-01 12:00:00,2011-07-02 12:00:00,1,2011-07-03,2011-12-04,154.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,cc031f250e8bad1e24060263b9fc0ddd,Restaurant,Gourmet dining,Kanto,Chiba,Chiba Prefecture
2,50,7000,3500,2011-07-12 12:00:00,2011-07-15 12:00:00,3,2011-07-16,2012-01-11,179.0,0.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,1.0,ba5e9b7453ca52ff711635a5d2e8102d,Restaurant,Gourmet dining,Kanto,Chiba,Chiba Prefecture
3,50,3000,1500,2011-07-09 12:00:00,2011-07-11 12:00:00,2,2011-07-12,2011-12-01,142.0,1.0,1.0,1.0,1.0,0.0,0.0,1.0,1.0,1.0,3e1ffbedca3569f9e8032d401e8cb4e6,Restaurant,Gourmet dining,Kanto,Chiba,Chiba Prefecture
4,50,2000,1000,2011-07-05 12:00:00,2011-07-06 12:00:00,1,2011-07-07,2011-12-30,176.0,1.0,1.0,0.0,1.0,0.0,0.0,1.0,1.0,0.0,782934b6c815b4030ea204eef7d4a734,Restaurant,Gourmet dining,Kanto,Chiba,Chiba Prefecture


In [None]:
# Translate the purchase log's small area names with the coupon-list mapping
# and list the distinct results as a sanity check.
detail_small_area_en = df_c_detail_train['SMALL_AREA_NAME'].map(small_area_dict)
df_c_detail_train['SMALL_AREA_NAME_EN'] = detail_small_area_en
print(detail_small_area_en.unique())

['Hyogo' 'Ginza, Shinbashi, Tokyo, Ueno' 'Ebisu / Meguro / Shinagawa'
 'Shibuya, Aoyama, Jiyugaoka' 'Shinjuku, Takadanobaba Nakano - Kichijoji'
 'Gunma' 'Aichi' 'Yamagata' 'Akasaka, Roppongi, Azabu'
 'Kawasaki, Shonan, Hakone, etc' 'Saitama' 'Yokohama' 'Tochigi'
 'Hiroshima' 'Ikebukuro Kagurazaka-Akabane' 'Triple' 'Gifu' 'Shizuoka'
 'Northern' 'Minami other' 'Shiga' 'Kyoto' 'Hokkaido' 'Ishikawa' 'Nagano'
 'Chiba' 'Wakayama' 'Kagoshima' 'Saga' 'Nagasaki' 'Fukuoka' 'Much'
 'Miyazaki' 'Okinawa' 'Tachikawa Machida, Hachioji other' 'Iwate' 'Toyama'
 'Shimane' 'Yamaguchi' 'Nara' 'Fukushima' 'Aomori' 'Miyagi' 'Ibaraki'
 'Akita' 'Okayama' 'Ehime' 'Kumamoto' 'Kagawa' 'Tokushima' 'Kochi' 'Fukui'
 'Niigata' 'Tottori' 'Yamanashi']


In [None]:
# SMALL_AREA_NAME_EN replaces the Japanese column and its machine translation.
# The frame is reassigned, so re-running this cell raises a KeyError because
# the columns are already gone.
df_c_detail_train = df_c_detail_train.drop(columns=['SMALL_AREA_NAME', 'SMALL_AREA_NAME_en_t'])
df_c_detail_train

Unnamed: 0,ITEM_COUNT,I_DATE,PURCHASEID_hash,USER_ID_hash,COUPON_ID_hash,SMALL_AREA_NAME_EN
0,1,2012-03-28 15:06:06,c820a8882374a4e472f0984a8825893f,d9dca3cb44bab12ba313eaa681f663eb,34c48f84026e08355dc3bd19b427f09a,Hyogo
1,1,2011-07-04 23:52:54,1b4eb2435421ede98c8931c42e8220ec,560574a339f1b25e57b0221e486907ed,767673b7a777854a92b73b0934ddfae7,"Ginza, Shinbashi, Tokyo, Ueno"
2,1,2011-07-16 00:52:49,36b5f9ba46c44b65587d0b16f2e4c77f,560574a339f1b25e57b0221e486907ed,4f3b5b91d9831192557c056022fdc1f2,Ebisu / Meguro / Shinagawa
3,1,2011-07-16 00:54:53,2f30f46937cc9004774e576914b2aa1a,560574a339f1b25e57b0221e486907ed,4f3b5b91d9831192557c056022fdc1f2,Ebisu / Meguro / Shinagawa
4,1,2011-07-16 00:55:52,4d000c64a55ac573d0ae1a8f03677f50,560574a339f1b25e57b0221e486907ed,4f3b5b91d9831192557c056022fdc1f2,Ebisu / Meguro / Shinagawa
...,...,...,...,...,...,...
168991,1,2012-02-07 12:14:50,84b0c66349ae3c807f1d4601bfc0e8f6,280f0cedda5c4b171ee6245889659571,6eac7a7e347c563e5e00086f5eb47903,Ikebukuro Kagurazaka-Akabane
168992,1,2012-02-28 15:43:21,f7b2b854457ae6ece44be04c32520064,280f0cedda5c4b171ee6245889659571,a406e389e35e1140e4bc9b472d8258df,"Ginza, Shinbashi, Tokyo, Ueno"
168993,1,2012-03-19 12:11:16,e12f28eb208f5466dede7a7cb2fc566b,280f0cedda5c4b171ee6245889659571,36addcc4b958135895c859d8783e3cd2,"Ginza, Shinbashi, Tokyo, Ueno"
168994,2,2012-04-12 12:27:34,bcade77b186543a4820b3a6e3c06ad2f,280f0cedda5c4b171ee6245889659571,cb0244705306aafccc47bfe62ece39d3,"Shinjuku, Takadanobaba Nakano - Kichijoji"


In [None]:
# Translate the coupon-area tables with the hand-made mappings.
# The training table gets the same treatment as the test table; without it,
# coupon_area_train would keep only the Japanese names and their machine
# translations.
df_area_train['SMALL_AREA_NAME_EN'] = df_area_train['SMALL_AREA_NAME'].map(small_area_dict)
df_area_train['PREF_NAME_EN'] = df_area_train['PREF_NAME'].map(pref_name_dict)
df_area_train = df_area_train.drop(columns=['SMALL_AREA_NAME', 'SMALL_AREA_NAME_en_t', 'PREF_NAME', 'PREF_NAME_en_t'])

df_area_test['SMALL_AREA_NAME_EN'] = df_area_test['SMALL_AREA_NAME'].map(small_area_dict)
df_area_test['PREF_NAME_EN'] = df_area_test['PREF_NAME'].map(pref_name_dict)
df_area_test

Unnamed: 0,SMALL_AREA_NAME,PREF_NAME,COUPON_ID_hash,SMALL_AREA_NAME_en_t,PREF_NAME_en_t,SMALL_AREA_NAME_EN,PREF_NAME_EN
0,京都,京都府,c76ea297ebd3a5a4d3bf9f75269f66fa,Kyoto,Kyoto.,Kyoto,Kyoto
1,ミナミ他,大阪府,c76ea297ebd3a5a4d3bf9f75269f66fa,Minashima,Osaka.,Minami other,Osaka prefecture
2,銀座・新橋・東京・上野,東京都,dd74dc95ca294afa02db40a543ae1763,"Ginza, New Bridge, Tokyo, Ueno.",Tokyo City,"Ginza, Shinbashi, Tokyo, Ueno",Tokyo
3,川崎・湘南・箱根他,神奈川県,c65b550cbef918796ad53b1d5b7165c1,I'll be right back. I'll be right back. I'll b...,Kanagawa,"Kawasaki, Shonan, Hakone, etc",Kanagawa Prefecture
4,埼玉,埼玉県,c65b550cbef918796ad53b1d5b7165c1,埼玉,Zheng-yang,Saitama,Saitama Prefecture
...,...,...,...,...,...,...,...
2160,ミナミ他,大阪府,f9c657ce7ca80b3766ced3a9a3c709bb,Minashima,Osaka.,Minami other,Osaka prefecture
2161,福井,福井県,f9c657ce7ca80b3766ced3a9a3c709bb,Fukui,Fukui,Fukui,Fukui Prefecture
2162,鳥取,鳥取県,f9c657ce7ca80b3766ced3a9a3c709bb,Birdcatcher.,Torigo,Tottori,Tottori Prefecture
2163,滋賀,滋賀県,f9c657ce7ca80b3766ced3a9a3c709bb,滋賀,Kaji prefecture,Shiga,Shiga Prefecture


In [None]:
# Keep only the coupon id and the *_EN columns of the test area table.
# The frame is reassigned, so re-running this cell raises a KeyError because
# the columns are already gone.
df_area_test = df_area_test.drop(columns=['SMALL_AREA_NAME', 'SMALL_AREA_NAME_en_t', 'PREF_NAME', 'PREF_NAME_en_t'])
df_area_test.head()

Unnamed: 0,COUPON_ID_hash,SMALL_AREA_NAME_EN,PREF_NAME_EN
0,c76ea297ebd3a5a4d3bf9f75269f66fa,Kyoto,Kyoto
1,c76ea297ebd3a5a4d3bf9f75269f66fa,Minami other,Osaka prefecture
2,dd74dc95ca294afa02db40a543ae1763,"Ginza, Shinbashi, Tokyo, Ueno",Tokyo
3,c65b550cbef918796ad53b1d5b7165c1,"Kawasaki, Shonan, Hakone, etc",Kanagawa Prefecture
4,c65b550cbef918796ad53b1d5b7165c1,Saitama,Saitama Prefecture


In [None]:
# Preview the browsing log; it was loaded without any translated columns
df_visit_train.head()

Unnamed: 0,PURCHASE_FLG,I_DATE,PAGE_SERIAL,REFERRER_hash,VIEW_COUPON_ID_hash,USER_ID_hash,SESSION_ID_hash,PURCHASEID_hash
0,0,2012-03-28 14:15:00,7,7d3892e54acb559ae36c459978489330,34c48f84026e08355dc3bd19b427f09a,d9dca3cb44bab12ba313eaa681f663eb,673af822615593249e7c6a9a1a6bbb1a,
1,0,2012-03-28 14:17:28,9,7d3892e54acb559ae36c459978489330,34c48f84026e08355dc3bd19b427f09a,d9dca3cb44bab12ba313eaa681f663eb,673af822615593249e7c6a9a1a6bbb1a,
2,0,2012-03-28 14:20:05,16,7d3892e54acb559ae36c459978489330,17c450c3b470c045d35ec22b02daa690,d9dca3cb44bab12ba313eaa681f663eb,673af822615593249e7c6a9a1a6bbb1a,
3,0,2012-03-28 14:23:16,18,7d3892e54acb559ae36c459978489330,91a15e6a95d09e5e01b50747833b317d,d9dca3cb44bab12ba313eaa681f663eb,673af822615593249e7c6a9a1a6bbb1a,
4,0,2012-03-28 14:26:25,20,7d3892e54acb559ae36c459978489330,96fcbc8f6e45d5a2de1661eb140c6e82,d9dca3cb44bab12ba313eaa681f663eb,673af822615593249e7c6a9a1a6bbb1a,


In [None]:
# Translate the prefecture names with the mapping built from the user list
df_locations['PREF_NAME_EN'] = df_locations['PREF_NAME'].map(pref_name_dict)

# Distinct prefectural office names, in order of first appearance
po_jp = df_locations['PREFECTUAL_OFFICE'].unique()

In [None]:
# Show the Japanese office names; the English list in the next cell follows this order
po_jp

array(['札幌市', '青森市', '盛岡市', '仙台市', '秋田市', '山形市', '福島市', '水戸市', '宇都宮市',
       '前橋市', 'さいたま市', '千葉市', '新宿区', '横浜市', '新潟市', '富山市', '金沢市', '福井市',
       '甲府市', '長野市', '岐阜市', '静岡市', '名古屋市', '津市', '大津市', '京都市', '大阪市',
       '神戸市', '奈良市', '和歌山市', '鳥取市', '松江市', '岡山市', '広島市', '山口市', '徳島市',
       '高松市', '松山市', '高知市', '福岡市', '佐賀市', '長崎市', '熊本市', '大分市', '宮崎市',
       '鹿児島市', '那覇市'], dtype=object)

In [None]:
# English names of the prefectural offices, paired by position with `po_jp`
po_en = [
    'Sapporo', 'Aomori City', 'Morioka City', 'Sendai City', 'Akita City',
    'Yamagata City', 'Fukushima City', 'Mito City', 'Utsunomiya City',
    'Maebashi', 'Saitama City', 'Chiba', 'Shinjuku ward', 'Yokohama City',
    'Niigata City', 'Toyama City', 'Kanazawa', 'Fukui City',
    'Kofu City', 'Nagano City', 'Gifu City', 'Shizuoka City', 'Nagoya City',
    'Tsu City', 'Otsu City', 'Kyoto City', 'Osaka City',
    'Kobe City', 'Nara City', 'Wakayama City', 'Tottori City', 'Matsue',
    'Okayama City', 'Hiroshima City', 'Yamaguchi City', 'Tokushima City',
    'Takamatsu City', 'Matsuyama City', 'Kochi City', 'Fukuoka City',
    'Saga City', 'Nagasaki City', 'Kumamoto City', 'Oita City',
    'Miyazaki City', 'Kagoshima City', 'Naha City',
]

assert len(po_jp) == len(po_en)

pref_office_dict = dict(zip(po_jp, po_en))
df_locations['PREFECTUAL_OFFICE_EN'] = df_locations['PREFECTUAL_OFFICE'].map(pref_office_dict)
df_locations

Unnamed: 0,PREF_NAME,PREFECTUAL_OFFICE,LATITUDE,LONGITUDE,PREF_NAME_en_t,PREFECTUAL_OFFICE_en_t,PREF_NAME_EN,PREFECTUAL_OFFICE_EN
0,北海道,札幌市,43.063968,141.347899,Hokkaido,Yokohama City,Hokkaido,Sapporo
1,青森県,青森市,40.824623,140.740593,Aomori,Aomori City,Aomori Prefecture,Aomori City
2,岩手県,盛岡市,39.703531,141.152667,Iwatea,And yet there is more to it than that.,Iwate Prefecture,Morioka City
3,宮城県,仙台市,38.268839,140.872103,Miyagi,Sendai City,Miyagi Prefecture,Sendai City
4,秋田県,秋田市,39.7186,140.102334,Akita,Akita City,Akita,Akita City
5,山形県,山形市,38.240437,140.363634,Hierarchy,Mountain City.,Yamagata Prefecture,Yamagata City
6,福島県,福島市,37.750299,140.467521,Fukushima,Fukushima City,Fukushima Prefecture,Fukushima City
7,茨城県,水戸市,36.341813,140.446793,Zhengji prefecture,"(For fully formatted text, see publication)",Ibaraki Prefecture,Mito City
8,栃木県,宇都宮市,36.565725,139.883565,Tsai,Inomiya City.,Tochigi Prefecture,Utsunomiya City
9,群馬県,前橋市,36.391208,139.060156,Cycling prefectures,Front Bridge City,Gunma Prefecture,Maebashi


In [None]:
# Keep the coordinates and the *_EN columns only.
# The frame is reassigned, so re-running this cell raises a KeyError because
# the columns are already gone.
df_locations = df_locations.drop(columns=['PREF_NAME', 'PREFECTUAL_OFFICE', 'PREF_NAME_en_t', 'PREFECTUAL_OFFICE_en_t'])
df_locations

Unnamed: 0,LATITUDE,LONGITUDE,PREF_NAME_EN,PREFECTUAL_OFFICE_EN
0,43.063968,141.347899,Hokkaido,Sapporo
1,40.824623,140.740593,Aomori Prefecture,Aomori City
2,39.703531,141.152667,Iwate Prefecture,Morioka City
3,38.268839,140.872103,Miyagi Prefecture,Sendai City
4,39.7186,140.102334,Akita,Akita City
5,38.240437,140.363634,Yamagata Prefecture,Yamagata City
6,37.750299,140.467521,Fukushima Prefecture,Fukushima City
7,36.341813,140.446793,Ibaraki Prefecture,Mito City
8,36.565725,139.883565,Tochigi Prefecture,Utsunomiya City
9,36.391208,139.060156,Gunma Prefecture,Maebashi


In [None]:
# Save CSV files to translated output. 
!mkdir data_translated
dir = 'data_translated'



In [None]:
# df_users.to_csv(f'{dir}/user_list.csv')
# df_area_test.to_csv(f'{dir}/coupon_area_test.csv')
# df_area_train.to_csv(f'{dir}/coupon_area_train.csv')
# df_c_detail_train.to_csv(f'{dir}/coupon_detail_train.csv')
# df_c_list_test.to_csv(f'{dir}/coupon_list_test.csv')
# df_c_list_train.to_csv(f'{dir}/coupon_list_train.csv')
# df_visit_train.to_csv(f'{dir}/coupon_visit_train.csv')
# df_locations.to_csv(f'{dir}/prefecture_locations.csv')

!zip -r translated_data.zip data_translated/

  adding: data_translated/ (stored 0%)
  adding: data_translated/coupon_visit_train.csv (deflated 77%)
  adding: data_translated/coupon_list_train.csv (deflated 79%)
  adding: data_translated/prefecture_locations.csv (deflated 57%)
  adding: data_translated/coupon_area_test.csv (deflated 86%)
  adding: data_translated/coupon_detail_train.csv (deflated 64%)
  adding: data_translated/coupon_area_train.csv (deflated 87%)
  adding: data_translated/user_list.csv (deflated 57%)
  adding: data_translated/coupon_list_test.csv (deflated 78%)
