# Load data

We detected two types of encoding on the training data.

# Chinese encoding and different formats

In [9]:
import json
import pandas as pd
import numpy as np
import magic
import os
from unidecode import unidecode

In [69]:
blob = open('../data/train/tournament_10.json', 'rb').read()
m = magic.Magic(mime_encoding=True)
encoding = m.from_buffer(blob)
print(encoding)

us-ascii


In [81]:
blob = open('../data/train/tournament_15.json', 'rb').read()
m = magic.Magic(mime_encoding=True)
encoding = m.from_buffer(blob)
print(encoding)

utf-8


We have a mixture of encondings so we want to unify all names under the same enconding. We are going to transform into ascii those which are Utf-8 using *unidecode*.

# Tranforming the data

In [23]:
with open("../data/train/tournament_15.json", "rb") as read_it:
     data = json.load(read_it)

In [24]:
t_15 = pd.json_normalize(data)
print(t_15.head())

            name  start_date    end_date  tours time_control  \
0  tournament_15  2014-01-31  2014-02-09      9      classic   

                                        games.tour_1  \
0  [{'white': '田佳达', 'black': '张跃', 'date': '2014...   

                                        games.tour_2  \
0  [{'white': '张跃', 'black': '董迎婴', 'date': '2014...   

                                        games.tour_3  \
0  [{'white': '邵红从', 'black': '张跃', 'date': '2014...   

                                        games.tour_4  \
0  [{'white': '张跃', 'black': '朱于', 'date': '2014-...   

                                        games.tour_5  \
0  [{'white': '杨皓棋', 'black': '张跃', 'date': '2014...   

                                        games.tour_6  \
0  [{'white': '张跃', 'black': '梁良', 'date': '2014-...   

                                        games.tour_7  \
0  [{'white': '赵西勇', 'black': '张跃', 'date': '2014...   

                                        games.tour_8  \
0  [{'white': '张跃', 'bla

In [25]:
t_15.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 1 entries, 0 to 0
Data columns (total 14 columns):
 #   Column        Non-Null Count  Dtype 
---  ------        --------------  ----- 
 0   name          1 non-null      object
 1   start_date    1 non-null      object
 2   end_date      1 non-null      object
 3   tours         1 non-null      int64 
 4   time_control  1 non-null      object
 5   games.tour_1  1 non-null      object
 6   games.tour_2  1 non-null      object
 7   games.tour_3  1 non-null      object
 8   games.tour_4  1 non-null      object
 9   games.tour_5  1 non-null      object
 10  games.tour_6  1 non-null      object
 11  games.tour_7  1 non-null      object
 12  games.tour_8  1 non-null      object
 13  games.tour_9  1 non-null      object
dtypes: int64(1), object(13)
memory usage: 240.0+ bytes


In [62]:
test = t_15.explode('games.tour_1')
test = test[['start_date', 'end_date', 'time_control', 'games.tour_1']]
test.rename(columns={'games.tour_1': 'games'}, inplace=True)
print(test.head())

   start_date    end_date time_control  \
0  2014-01-31  2014-02-09      classic   
0  2014-01-31  2014-02-09      classic   
0  2014-01-31  2014-02-09      classic   
0  2014-01-31  2014-02-09      classic   
0  2014-01-31  2014-02-09      classic   

                                               games  
0  {'white': '田佳达', 'black': '张跃', 'date': '2014-...  
0  {'white': '董迎婴', 'black': '陈吉凝', 'date': '2014...  
0  {'white': '邵红从', 'black': '曾英兰', 'date': '2014...  
0  {'white': '朱于', 'black': '赵西勇', 'date': '2014-...  
0  {'white': '杨皓棋', 'black': '梁良', 'date': '2014-...  


In [63]:
final = pd.concat([test.drop(['games'], axis=1), test['games'].apply(pd.Series)], axis=1)
final

Unnamed: 0,start_date,end_date,time_control,white,black,date,result,id
0,2014-01-31,2014-02-09,classic,田佳达,张跃,2014-01-31,1.0,tournament_15_1
0,2014-01-31,2014-02-09,classic,董迎婴,陈吉凝,2014-01-31,0.5,tournament_15_2
0,2014-01-31,2014-02-09,classic,邵红从,曾英兰,2014-01-31,0.5,tournament_15_3
0,2014-01-31,2014-02-09,classic,朱于,赵西勇,2014-01-31,0.5,tournament_15_4
0,2014-01-31,2014-02-09,classic,杨皓棋,梁良,2014-01-31,0.5,tournament_15_5


In [67]:
final['white'] = final['white'].apply(lambda x: unidecode(x))
final

Unnamed: 0,start_date,end_date,time_control,white,black,date,result,id
0,2014-01-31,2014-02-09,classic,Tian Jia Da,张跃,2014-01-31,1.0,tournament_15_1
0,2014-01-31,2014-02-09,classic,Dong Ying Ying,陈吉凝,2014-01-31,0.5,tournament_15_2
0,2014-01-31,2014-02-09,classic,Shao Hong Cong,曾英兰,2014-01-31,0.5,tournament_15_3
0,2014-01-31,2014-02-09,classic,Zhu Yu,赵西勇,2014-01-31,0.5,tournament_15_4
0,2014-01-31,2014-02-09,classic,Yang Hao Qi,梁良,2014-01-31,0.5,tournament_15_5


There is a difference between the names written in Ascii and those converted from Utf-8. We need to have a maximum two words, i.e., surname and name. However we have three words at the transformed players names so if we find that there are three words, we need to merge the last two to build the name.

In [107]:
def build_name(x):
    full_name = unidecode(x).rstrip()
    words = full_name.split(" ")
    surname = words[0]
    if len(words) == 3:
        name = words[1] + words[2].lower()
        return surname + ', ' + name
    return surname + ', ' + words[1]

def build_dataframe(tour_data, utf_8 = False):
    final_df = pd.DataFrame()
    nr_tour = tour_data['tours'].values[0]
    prefix = 'games.tour_'
    for i in range(1, nr_tour):
        column_name = prefix + str(i)
        print("Extracting games from "+column_name)
        temp = tour_data.explode(column_name)
        temp = temp[['start_date', 'end_date', 'time_control', column_name]]
        temp.rename(columns={column_name: 'games'}, inplace=True)
        final = pd.concat([temp.drop(['games'], axis=1), temp['games'].apply(pd.Series)], axis=1)
        if utf_8:
            final['white'] = final['white'].apply(lambda x: build_name(x))
            final['black'] = final['black'].apply(lambda x: build_name(x))
        final_df = pd.concat([final_df, final])
    return final_df

In [None]:
m = magic.Magic(mime_encoding=True)
with open("../data/train/tournament_10.json", "rb") as read_it:
    blob = open('../train/tournament_10.json', 'rb').read()
    encoding = m.from_buffer(blob)
    raw = json.load(read_it)
    data = pd.json_normalize(raw)
    test = build_dataframe(data, encoding =='utf-8')

In [99]:
print(test.head())

   start_date    end_date time_control          white           black  \
0  2014-01-15  2014-02-08        rapid    Liang, Qing     Tian, Xiong   
0  2014-01-15  2014-02-08        rapid      Wei, Ming  Pang, Ronggang   
0  2014-01-15  2014-02-08        rapid   Guo, Hongmin   Wang, Jianyao   
0  2014-01-15  2014-02-08        rapid    Xu, Xiaozhi     Wu, Haifang   
0  2014-01-15  2014-02-08        rapid  Zhang, Yajuan       Wang, Jin   

         date  result               id  
0  2014-01-15     0.5  tournament_10_1  
0  2014-01-15     1.0  tournament_10_2  
0  2014-01-15     0.0  tournament_10_3  
0  2014-01-15     1.0  tournament_10_4  
0  2014-01-15     0.0  tournament_10_5  


In [80]:
test.info()

<class 'pandas.core.frame.DataFrame'>
Int64Index: 275 entries, 0 to 0
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   start_date    275 non-null    object 
 1   end_date      275 non-null    object 
 2   time_control  275 non-null    object 
 3   white         275 non-null    object 
 4   black         275 non-null    object 
 5   date          275 non-null    object 
 6   result        275 non-null    float64
 7   id            275 non-null    object 
dtypes: float64(1), object(7)
memory usage: 19.3+ KB
None


Testing for the other file.

In [None]:
with open("../data/train/tournament_15.json", "rb") as read_it:
    blob = open('../data/train/tournament_15.json', 'rb').read()
    encoding = m.from_buffer(blob)
    raw = json.load(read_it)
    data = pd.json_normalize(raw)
    test = build_dataframe(data, encoding =='utf-8')

In [109]:
print(test.head())

   start_date    end_date time_control           white          black  \
0  2014-01-31  2014-02-09      classic     Tian, Jiada     Zhang, Yue   
0  2014-01-31  2014-02-09      classic  Dong, Yingying   Chen, Jining   
0  2014-01-31  2014-02-09      classic  Shao, Hongcong  Ceng, Yinglan   
0  2014-01-31  2014-02-09      classic         Zhu, Yu   Zhao, Xiyong   
0  2014-01-31  2014-02-09      classic     Yang, Haoqi   Liang, Liang   

         date  result               id  
0  2014-01-31     1.0  tournament_15_1  
0  2014-01-31     0.5  tournament_15_2  
0  2014-01-31     0.5  tournament_15_3  
0  2014-01-31     0.5  tournament_15_4  
0  2014-01-31     0.5  tournament_15_5  


In [84]:
print(test.info())

<class 'pandas.core.frame.DataFrame'>
Int64Index: 40 entries, 0 to 0
Data columns (total 8 columns):
 #   Column        Non-Null Count  Dtype  
---  ------        --------------  -----  
 0   start_date    40 non-null     object 
 1   end_date      40 non-null     object 
 2   time_control  40 non-null     object 
 3   white         40 non-null     object 
 4   black         40 non-null     object 
 5   date          40 non-null     object 
 6   result        40 non-null     float64
 7   id            40 non-null     object 
dtypes: float64(1), object(7)
memory usage: 2.8+ KB
None


In [110]:
test.tail()

Unnamed: 0,start_date,end_date,time_control,white,black,date,result,id
0,2014-01-31,2014-02-09,classic,"Zhang, Yue","Ceng, Yinglan",2014-02-07,0.5,tournament_15_36
0,2014-01-31,2014-02-09,classic,"Zhao, Xiyong","Chen, Jining",2014-02-07,1.0,tournament_15_37
0,2014-01-31,2014-02-09,classic,"Tian, Jiada","Liang, Liang",2014-02-07,0.5,tournament_15_38
0,2014-01-31,2014-02-09,classic,"Dong, Yingying","Yang, Haoqi",2014-02-07,0.0,tournament_15_39
0,2014-01-31,2014-02-09,classic,"Shao, Hongcong","Zhu, Yu",2014-02-07,0.0,tournament_15_40


## Testing files

We need to find the way to invert the transformation used to read the json file.

In [72]:
import json
with open("../data/test/tournament_test_2.json", "rb") as read_it:
     raw = json.load(read_it)

In [73]:
import magic
m = magic.Magic(mime_encoding=True)
with open("../data/test/tournament_test_2.json", "rb") as read_it:
    blob = open("../data/test/tournament_test_2.json", 'rb').read()
    encoding = m.from_buffer(blob)
    print(encoding)

utf-8


In [None]:
raw

In [76]:
import pandas as pd
data = pd.json_normalize(raw)

In [77]:
data.head()

Unnamed: 0,name,start_date,end_date,tours,time_control,games.tour_1,games.tour_2,games.tour_3,games.tour_4,games.tour_5,games.tour_6,games.tour_7,games.tour_8,games.tour_9,games.tour_10
0,tournament_test_2,2020-01-20,2020-02-09,10,classic,"[{'white': '吕亚光', 'black': '胡建静', 'date': '202...","[{'white': '陈荣顺', 'black': '吕亚光', 'date': '202...","[{'white': '崔军凯', 'black': '陈荣顺', 'date': '202...","[{'white': '陈荣顺', 'black': '丁晨彤', 'date': '202...","[{'white': '傅星菲', 'black': '陈荣顺', 'date': '202...","[{'white': '陈荣顺', 'black': '魏辟涛', 'date': '202...","[{'white': '黄凯文', 'black': '陈荣顺', 'date': '202...","[{'white': '陈荣顺', 'black': '张春', 'date': '2020...","[{'white': '董海非', 'black': '陈荣顺', 'date': '202...","[{'white': '陈荣顺', 'black': '陈森', 'date': '2020..."


In [78]:
data['games.tour_1'][0]

[{'white': '吕亚光',
  'black': '胡建静',
  'date': '2020-01-20',
  'id': 'tournament_test_2_1'},
 {'white': '崔军凯',
  'black': '陈森',
  'date': '2020-01-20',
  'id': 'tournament_test_2_2'},
 {'white': '丁晨彤',
  'black': '董海非',
  'date': '2020-01-20',
  'id': 'tournament_test_2_3'},
 {'white': '傅星菲',
  'black': '张春',
  'date': '2020-01-20',
  'id': 'tournament_test_2_4'},
 {'white': '魏辟涛',
  'black': '黄凯文',
  'date': '2020-01-20',
  'id': 'tournament_test_2_5'}]

In [88]:
t = data['games.tour_1'][0][0]['white'].encode('utf-8')
t.decode('utf-8')

'吕亚光'

In [89]:
for a in data['games.tour_1'][0][0]['white']:
    print(a)

吕
亚
光


In [90]:
characters = []
for a in data['games.tour_1'][0][0]['white']:
    characters.append(a)
characters

['吕', '亚', '光']

In [111]:
len(characters)

3

In [99]:
new_name = " ".join(characters)
new_name

'吕 亚 光'

In [110]:
build_name(new_name, True)

吕 亚 光
Lu  Ya  Guang 
['Lu', 'Ya', 'Guang']


'Lu Yaguang'

In [96]:
new_name = new_name.rstrip()
unidecode(new_name).split(" ")

['Lu', '', 'Ya', '', 'Guang', '']

In [None]:
for a in data['games.tour_1'][0][0]['white']:
    print(build_name(a,))

In [109]:
from unidecode import unidecode
import magic
def build_name(x, utf_8=False):
    '''
    Auxiliary function to build the player's name removing commas and under the same ascii encoding
    :param x: the name of the player as a string
    :param utf_8: True if Utf-8 encodings are used
    :return: the clean player's name in Ascii
    '''
    # remove additional whitespaces at the end of the string or at the beginning
    x = x.rstrip()
    x = x.lstrip()
    # remove commas
    x = x.replace(",","")
    print(x)
    if utf_8:
        full_name = unidecode(x)
    else:
        full_name = x
    print(full_name)
    full_name = full_name.replace('  ', ' ')
    full_name = full_name.rstrip()
    full_name = full_name.lstrip()
    words = full_name.split(" ")
    print(words)
    surname = words[0]

    if len(words) == 3:
        name = words[1] + words[2].lower()
        new_name = surname + ' ' + name
        return new_name
    new_name = surname + ' ' + words[1]
    return new_name

In [83]:
build_name(data['games.tour_1'][0][0]['white'], True)

'Lu Ya'

In [70]:
def add_result(data, column_name, result, encoding):
    new_tour = []
    for d in data[column_name][0]:
        if encoding == 'utf-8':
            d['white'] = unidecode(d['white'])
            d['black'] = unidecode(d['black'])
        d['result'] = 0.5
        new_tour.append(d)
    return new_tour
    #print(new_tour_1)

In [None]:
nr_tour = data['tours'].values[0]
prefix = 'games.tour_'
for i in range(1, nr_tour):
    column_name = prefix + str(i)
    new_tour = add_result(data, column_name, 0.5, 'utf-8')
    data.loc[data['name']=='tournament_test_0',column_name][0] = new_tour

In [60]:
def df_to_formatted_json(df, sep="."):
    """
    The opposite of json_normalize
    """
    result = []
    for idx, row in df.iterrows():
        parsed_row = {}
        for col_label,v in row.items():
            keys = col_label.split(".")

            current = parsed_row
            for i, k in enumerate(keys):
                if i==len(keys)-1:
                    current[k] = v
                else:
                    if k not in current.keys():
                        current[k] = {}
                    current = current[k]
        # save
        result.append(parsed_row)
    return result[0]

In [61]:
raw_back = df_to_formatted_json(data)

In [None]:
raw_back

In [63]:
with open("tournament_test_0.json", "w") as fp:
    json.dump(raw_back, fp, ensure_ascii=False, indent=4)
    fp.close()

# Summary
The Chinese encoding of some names was challenging, specially those given in chinese characters instead of Ascii recognized letters. Besides the same names were sometimes given with one type of encoding and sometimes with others so we needed to identify them as the same player and not two different players. How we tackle this issues?

* unifying the encoding of all names thanks to the unicode library.
* removing commas and giving the same format for all names: "surname name".
* checking the final names and identifying redundancies, for instance with the character 'ü' that we discovered it was wrongly translated as 'v'.

# Resources
https://pypi.org/project/python-magic/

https://www.delftstack.com/howto/python/convert-unicode-to-ascii-python/