# Character Encodings and Detection with Python, chardet, and cchardet
- Ref: https://dev.to/bowmanjd/character-encodings-and-detection-with-python-chardet-and-cchardet-4hj7

In [1]:
# Each \xNN escape names one character by its hexadecimal value:
# 0x73 0x70 0x61 0x6d spell "spam".
print("\x73\x70\x61\x6d")

spam


In [2]:
# The same four values held as raw bytes and decoded explicitly as ASCII.
b"\x73\x70\x61\x6d".decode("ascii")

'spam'

In [32]:
# Byte 0xe9 is outside the 7-bit ASCII range, so this decode fails.
# The failure is the point of the cell, so it is caught and printed rather
# than left to abort a top-to-bottom run of the notebook.
try:
    b"\x4a\x6f\x73\xe9".decode("ascii")
except UnicodeDecodeError as exc:
    ascii_error = "{}: {}".format(type(exc).__name__, exc)
    print(ascii_error)

UnicodeDecodeError: 'ascii' codec can't decode byte 0xe9 in position 3: ordinal not in range(128)

In [33]:
# The same bytes are not valid UTF-8 either: the decoder rejects the trailing
# 0xe9 byte. The error is caught and printed so that the notebook still runs
# from top to bottom.
try:
    b"\x4a\x6f\x73\xe9".decode("utf-8")
except UnicodeDecodeError as exc:
    utf8_error = "{}: {}".format(type(exc).__name__, exc)
    print(utf8_error)

UnicodeDecodeError: 'utf-8' codec can't decode byte 0xe9 in position 3: unexpected end of data

In [5]:
# Decoded as ISO-8859-1 the same bytes succeed: 0xe9 comes out as "é".
b"\x4a\x6f\x73\xe9".decode("iso-8859-1")

'José'

## Character Encoding detection using chardet

### Method-1 using chardet.detect

In [10]:
import chardet

# Bytes whose encoding we pretend not to know.
name = b"\x4a\x6f\x73\xe9"

# Ask chardet for its best guess and show the whole result dict.
detection = chardet.detect(name)
print(detection)

# Decode the bytes with the codec name chardet reported.
encoding_name = detection["encoding"]
decoded_name = name.decode(encoding_name)
print(decoded_name)

{'encoding': 'ISO-8859-1', 'confidence': 0.73, 'language': ''}
José


### Method-2 detecting character encoding from file

In [31]:
import sys
import time
from pathlib import Path

import chardet

# Detect the encoding of a file on disk, decode it, and report the result.
# perf_counter() is a high-resolution clock intended for measuring short
# durations; the timed span covers reading, detecting and decoding.
t1 = time.perf_counter()

# Swap in 'sample-latin1.csv' to try a non-UTF-8 input.
filename = 'sample-utf8.csv'
filepath = Path(filename)
# We must read as binary (bytes) because we don't yet know encoding
blob = filepath.read_bytes()

detection = chardet.detect(blob)
encoding = detection["encoding"]
confidence = detection["confidence"]
# detect() reports encoding=None when it cannot make a guess (e.g. an empty
# file); fail with a clear message instead of a TypeError from decode(None).
if encoding is None:
    raise ValueError('could not detect the encoding of {}'.format(filename))
text = blob.decode(encoding)
t2 = time.perf_counter()
print(' text= {} \n encoding= {}\n confidence= {}, time taken : {} sec'.format(text, encoding, confidence, (t2-t1)))

 text= Last name,First name
Eldjárn,Kristján
González,José
Hernández,Fátima
Kazanci,Miraç
Rivera,Tomás
Sánchez,Aarón
Vargas,Jesús
Öksüz,Yasin
 
 encoding= utf-8
 confidence= 0.99, time taken : 0.002992868423461914 sec


## Character Encoding detection using cchardet
`cchardet` exposes the same `detect()` interface as `chardet` and is generally faster.

In [30]:
import sys
import time
from pathlib import Path

import cchardet

# Detect the encoding of a file on disk with cchardet, decode it, and report
# the result. perf_counter() is a high-resolution clock intended for measuring
# short durations; the timed span covers reading, detecting and decoding.
t1 = time.perf_counter()

# Swap in 'sample-latin1.csv' to try a non-UTF-8 input.
filename = 'sample-utf8.csv'
filepath = Path(filename)
# We must read as binary (bytes) because we don't yet know encoding
blob = filepath.read_bytes()

detection = cchardet.detect(blob)
encoding = detection["encoding"]
confidence = detection["confidence"]
# detect() reports encoding=None when it cannot make a guess (e.g. an empty
# file); fail with a clear message instead of a TypeError from decode(None).
if encoding is None:
    raise ValueError('could not detect the encoding of {}'.format(filename))
text = blob.decode(encoding)
t2 = time.perf_counter()
print(' text= {} \n encoding= {}\n confidence= {}, time taken : {} sec'.format(text, encoding, confidence, (t2-t1)))

 text= Last name,First name
Eldjárn,Kristján
González,José
Hernández,Fátima
Kazanci,Miraç
Rivera,Tomás
Sánchez,Aarón
Vargas,Jesús
Öksüz,Yasin
 
 encoding= UTF-8
 confidence= 0.9900000095367432, time taken : 0.00299072265625 sec
