-
Notifications
You must be signed in to change notification settings - Fork 14.2k
Closed
Labels
Description
I made these string conversion functions so codecvt can be removed from common.cpp:
// Helper function to convert UTF-8 to UTF-32
//
// Decodes `input` strictly and returns one char32_t per encoded code point.
// Throws std::runtime_error when:
//   - a lead byte is not a valid 1/2/3/4-byte lead,
//   - a multi-byte sequence is cut short or a trailing byte is not 10xxxxxx,
//   - the decoded value is an overlong form, a UTF-16 surrogate
//     (U+D800..U+DFFF), or larger than U+10FFFF.
std::u32string utf8_to_utf32(const std::string& input){
    const size_t total = input.size();
    std::u32string decoded;
    decoded.reserve(total); // upper bound: at most one code point per byte

    size_t pos = 0;
    while(pos<total){
        const unsigned char lead = static_cast<unsigned char>(input[pos]);

        // Classify the lead byte: number of trailing bytes expected and the
        // payload bits carried by the lead byte itself.
        size_t trailing = 0;
        char32_t value = 0;
        if(lead<0x80){
            value = lead;          // 0xxxxxxx
        } else if((lead>>5)==0x06){
            trailing = 1;          // 110xxxxx
            value = lead&0x1F;
        } else if((lead>>4)==0x0E){
            trailing = 2;          // 1110xxxx
            value = lead&0x0F;
        } else if((lead>>3)==0x1E){
            trailing = 3;          // 11110xxx
            value = lead&0x07;
        } else{
            throw std::runtime_error("Invalid UTF-8 sequence");
        }

        // The whole sequence must fit inside the input.
        if(trailing>0&&pos+trailing>=total){
            throw std::runtime_error("Invalid UTF-8 sequence");
        }

        // Fold in six payload bits from each 10xxxxxx trailing byte.
        for(size_t k = 1; k<=trailing; ++k){
            const unsigned char next = static_cast<unsigned char>(input[pos+k]);
            if((next&0xC0)!=0x80){
                throw std::runtime_error("Invalid UTF-8 sequence");
            }
            value = (value<<6)|(next&0x3F);
        }

        // Reject values that must not be produced by a sequence of this length.
        switch(trailing){
            case 1:
                if(value<0x80){
                    throw std::runtime_error("Overlong UTF-8 encoding");
                }
                break;
            case 2:
                // Overlong form or UTF-16 surrogate
                if(value<0x800||(value>=0xD800&&value<=0xDFFF)){
                    throw std::runtime_error("Invalid UTF-8 encoding");
                }
                break;
            case 3:
                // Overlong form or beyond the Unicode range
                if(value<0x10000||value>0x10FFFF){
                    throw std::runtime_error("Invalid UTF-8 encoding");
                }
                break;
            default:
                break;
        }

        decoded.push_back(value);
        pos += trailing+1;
    }
    return decoded;
}
// Helper function to convert UTF-32 to UTF-8
//
// Encodes every code point of `input` as 1 to 4 UTF-8 bytes.
// Throws std::runtime_error for UTF-16 surrogate values (U+D800..U+DFFF) and
// for values above U+10FFFF.
std::string utf32_to_utf8(const std::u32string& input){
    std::string encoded;
    encoded.reserve(4*input.size()); // worst case: four bytes per code point

    // Appends the low eight bits of `bits` as one output byte.
    const auto emit = [&encoded](char32_t bits){
        encoded += static_cast<char>(bits);
    };

    for(size_t idx = 0; idx!=input.size(); ++idx){
        const char32_t cp = input[idx];

        // Reject unencodable values before emitting anything for them.
        if(cp>=0x110000){
            throw std::runtime_error("Invalid Unicode code point");
        }
        if(cp>=0xD800&&cp<=0xDFFF){
            throw std::runtime_error("UTF-16 surrogate values are invalid in UTF-32");
        }

        if(cp<=0x7F){
            // 0xxxxxxx
            emit(cp);
            continue;
        }
        if(cp<=0x7FF){
            // 110xxxxx 10xxxxxx
            emit(0xC0|(cp>>6));
            emit(0x80|(cp&0x3F));
            continue;
        }
        if(cp<=0xFFFF){
            // 1110xxxx 10xxxxxx 10xxxxxx
            emit(0xE0|(cp>>12));
            emit(0x80|((cp>>6)&0x3F));
            emit(0x80|(cp&0x3F));
            continue;
        }
        // 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
        emit(0xF0|(cp>>18));
        emit(0x80|((cp>>12)&0x3F));
        emit(0x80|((cp>>6)&0x3F));
        emit(0x80|(cp&0x3F));
    }
    return encoded;
}
// UTF-8 to wstring conversion
//
// Strictly decodes `str` as UTF-8 and returns the result as a wide string.
// Code points up to U+FFFF are always stored as a single wchar_t. Code points
// above U+FFFF are stored directly when wchar_t is at least 32 bits wide, and
// as a UTF-16 surrogate pair (high, then low) when it is narrower.
//
// Throws std::runtime_error when:
//   - a lead byte is not a valid 1/2/3/4-byte lead,
//   - a sequence is truncated or a trailing byte is not 10xxxxxx,
//   - the decoded value is an overlong form, a UTF-16 surrogate
//     (U+D800..U+DFFF), or larger than U+10FFFF.
static std::wstring utf8_to_wstring(const std::string& str){
    const size_t len = str.size();
    std::wstring result;
    result.reserve(len); // upper bound: at most one wchar_t per input byte

    for(size_t i = 0; i<len;){
        const unsigned char lead = static_cast<unsigned char>(str[i]);

        size_t extra = 0;     // number of continuation bytes expected
        char32_t cp = 0;      // code point being assembled
        char32_t min_cp = 0;  // smallest value that legitimately needs this length
        if(lead<0x80){
            // 1-byte sequence
            cp = lead;
        } else if((lead&0xE0)==0xC0){
            // 2-byte sequence
            extra = 1;
            cp = lead&0x1F;
            min_cp = 0x80;
        } else if((lead&0xF0)==0xE0){
            // 3-byte sequence
            extra = 2;
            cp = lead&0x0F;
            min_cp = 0x800;
        } else if((lead&0xF8)==0xF0){
            // 4-byte sequence
            extra = 3;
            cp = lead&0x07;
            min_cp = 0x10000;
        } else{
            throw std::runtime_error("Invalid UTF-8 sequence");
        }

        // Truncated sequence: not enough bytes left in the input.
        if(extra>0&&i+extra>=len){
            throw std::runtime_error("Invalid UTF-8 sequence");
        }

        // Every continuation byte must be 10xxxxxx; each adds six payload bits.
        for(size_t k = 1; k<=extra; k++){
            const unsigned char cont = static_cast<unsigned char>(str[i+k]);
            if((cont&0xC0)!=0x80){
                throw std::runtime_error("Invalid UTF-8 sequence");
            }
            cp = (cp<<6)|(cont&0x3F);
        }

        // Overlong forms, surrogates and out-of-range values are not valid UTF-8.
        if(cp<min_cp||cp>0x10FFFF||(cp>=0xD800&&cp<=0xDFFF)){
            throw std::runtime_error("Invalid UTF-8 encoding");
        }
        i += extra+1;

        if(cp<0x10000||sizeof(wchar_t)>=4){
            result.push_back(static_cast<wchar_t>(cp));
        } else{
            // Narrow wchar_t: split the supplementary code point into a
            // UTF-16 surrogate pair instead of discarding it.
            const char32_t offset = cp-0x10000;
            result.push_back(static_cast<wchar_t>(0xD800|(offset>>10)));
            result.push_back(static_cast<wchar_t>(0xDC00|(offset&0x3FF)));
        }
    }
    return result;
}
// wstring to UTF-8 conversion
//
// Encodes `wstr` as UTF-8. Each wchar_t is treated as a code unit:
//   - a high surrogate (U+D800..U+DBFF) immediately followed by a low
//     surrogate (U+DC00..U+DFFF) is combined into one supplementary code point
//     and written as a 4-byte sequence;
//   - values above U+FFFF (possible when wchar_t is 32 bits wide) are written
//     as 4-byte sequences;
//   - unpaired surrogates and values above U+10FFFF (including negative values
//     of a signed wchar_t) cannot be expressed in UTF-8 and are replaced by
//     U+FFFD, the replacement character.
// This function does not throw on malformed input.
static std::string wstring_to_utf8(const std::wstring& wstr){
    const size_t len = wstr.size();
    // A narrow wchar_t may sign-extend when widened; keep only its 16 bits.
    const char32_t unit_mask = sizeof(wchar_t)>=4 ? 0xFFFFFFFF : 0xFFFF;

    std::string result;
    // Worst case per unit: 4 bytes for a 32-bit wchar_t, 3 bytes for a 16-bit
    // one (a surrogate pair yields 4 bytes from 2 units).
    result.reserve(len*(sizeof(wchar_t)>=4 ? 4 : 3));

    for(size_t i = 0; i<len; i++){
        char32_t cp = static_cast<char32_t>(wstr[i])&unit_mask;

        // Combine a well-formed surrogate pair into a single code point.
        if(cp>=0xD800&&cp<=0xDBFF&&i+1<len){
            const char32_t low = static_cast<char32_t>(wstr[i+1])&unit_mask;
            if(low>=0xDC00&&low<=0xDFFF){
                cp = 0x10000+((cp-0xD800)<<10)+(low-0xDC00);
                i++; // the low surrogate has been consumed
            }
        }

        // Anything still not a Unicode scalar value becomes U+FFFD.
        if(cp>0x10FFFF||(cp>=0xD800&&cp<=0xDFFF)){
            cp = 0xFFFD;
        }

        if(cp<0x80){
            result.push_back(static_cast<char>(cp));
        } else if(cp<0x800){
            result.push_back(static_cast<char>(0xC0|(cp>>6)));
            result.push_back(static_cast<char>(0x80|(cp&0x3F)));
        } else if(cp<0x10000){
            result.push_back(static_cast<char>(0xE0|(cp>>12)));
            result.push_back(static_cast<char>(0x80|((cp>>6)&0x3F)));
            result.push_back(static_cast<char>(0x80|(cp&0x3F)));
        } else{
            result.push_back(static_cast<char>(0xF0|(cp>>18)));
            result.push_back(static_cast<char>(0x80|((cp>>12)&0x3F)));
            result.push_back(static_cast<char>(0x80|((cp>>6)&0x3F)));
            result.push_back(static_cast<char>(0x80|(cp&0x3F)));
        }
    }
    return result;
}