Skip to content

Replacement for deprecated codecvt string conversion #12151

@CommanderLake

Description

@CommanderLake

I wrote these string conversion functions so that the deprecated codecvt facilities can be removed from common.cpp:

// Decode a UTF-8 byte string into UTF-32 code points.
// Throws std::runtime_error when a sequence is truncated, starts with an
// invalid lead byte, contains a bad continuation byte, is overlong, encodes a
// UTF-16 surrogate, or yields a value above U+10FFFF.
std::u32string utf8_to_utf32(const std::string& input){
	const size_t length = input.size();
	std::u32string decoded;
	decoded.reserve(length); // one code point per byte is the upper bound
	// Unsigned view of the byte stored at `at`.
	const auto byte_at = [&input](size_t at){ return static_cast<unsigned char>(input[at]); };
	// True when `at` lies inside the input and holds a continuation byte (10xxxxxx).
	const auto is_tail = [&](size_t at){ return at<length&&(byte_at(at)&0xC0)==0x80; };
	// The six payload bits of the continuation byte at `at`.
	const auto tail_bits = [&](size_t at){ return static_cast<char32_t>(byte_at(at)&0x3F); };
	size_t pos = 0;
	while(pos<length){
		const char32_t lead = byte_at(pos);
		char32_t value = 0;
		if(lead<0x80){
			// Single byte: 0xxxxxxx
			value = lead;
			pos += 1;
		} else if((lead&0xE0)==0xC0){
			// Two bytes: 110xxxxx 10xxxxxx
			if(!is_tail(pos+1)) throw std::runtime_error("Invalid UTF-8 sequence");
			value = ((lead&0x1F)<<6)|tail_bits(pos+1);
			pos += 2;
			// Anything below U+0080 should have used the one-byte form
			if(value<0x80) throw std::runtime_error("Overlong UTF-8 encoding");
		} else if((lead&0xF0)==0xE0){
			// Three bytes: 1110xxxx 10xxxxxx 10xxxxxx
			if(!is_tail(pos+1)||!is_tail(pos+2)) throw std::runtime_error("Invalid UTF-8 sequence");
			value = ((lead&0x0F)<<12)|(tail_bits(pos+1)<<6)|tail_bits(pos+2);
			pos += 3;
			// Reject overlong forms and the UTF-16 surrogate range
			const bool is_surrogate = value>=0xD800&&value<=0xDFFF;
			if(value<0x800||is_surrogate) throw std::runtime_error("Invalid UTF-8 encoding");
		} else if((lead&0xF8)==0xF0){
			// Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
			if(!is_tail(pos+1)||!is_tail(pos+2)||!is_tail(pos+3)) throw std::runtime_error("Invalid UTF-8 sequence");
			value = ((lead&0x07)<<18)|(tail_bits(pos+1)<<12)|(tail_bits(pos+2)<<6)|tail_bits(pos+3);
			pos += 4;
			// Reject overlong forms and values past the end of the Unicode range
			if(value<0x10000||value>0x10FFFF) throw std::runtime_error("Invalid UTF-8 encoding");
		} else{
			// Stray continuation byte or a lead byte of 0xF8 and above
			throw std::runtime_error("Invalid UTF-8 sequence");
		}
		decoded.push_back(value);
	}
	return decoded;
}
// Encode a sequence of UTF-32 code points as a UTF-8 byte string.
// Throws std::runtime_error for values in the UTF-16 surrogate range
// (U+D800..U+DFFF) and for values above U+10FFFF.
std::string utf32_to_utf8(const std::u32string& input){
	std::string encoded;
	encoded.reserve(4*input.size()); // a code point never needs more than four bytes
	// Append one byte made of a marker prefix combined with payload bits.
	const auto emit = [&encoded](char32_t marker, char32_t bits){
		encoded.push_back(static_cast<char>(marker|bits));
	};
	// Append a continuation byte (10xxxxxx) holding the six bits of `cp` found at `shift`.
	const auto emit_tail = [&emit](char32_t cp, unsigned shift){
		emit(0x80, (cp>>shift)&0x3F);
	};
	for(auto it = input.begin(); it!=input.end(); ++it){
		const char32_t cp = *it;
		// Beyond the last Unicode code point
		if(cp>=0x110000){ throw std::runtime_error("Invalid Unicode code point"); }
		if(cp<0x80){
			// One byte: 0xxxxxxx
			emit(0x00, cp);
			continue;
		}
		if(cp<0x800){
			// Two bytes: 110xxxxx 10xxxxxx
			emit(0xC0, cp>>6);
			emit_tail(cp, 0);
			continue;
		}
		if(cp<0x10000){
			// Surrogates are reserved for UTF-16 and are not valid scalar values
			if(cp>=0xD800&&cp<=0xDFFF){ throw std::runtime_error("UTF-16 surrogate values are invalid in UTF-32"); }
			// Three bytes: 1110xxxx 10xxxxxx 10xxxxxx
			emit(0xE0, cp>>12);
			emit_tail(cp, 6);
			emit_tail(cp, 0);
			continue;
		}
		// Four bytes: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
		emit(0xF0, cp>>18);
		emit_tail(cp, 12);
		emit_tail(cp, 6);
		emit_tail(cp, 0);
	}
	return encoded;
}
// UTF-8 to wstring conversion.
// Decodes `str` as UTF-8 and returns the text as a wide string. Code points
// outside the Basic Multilingual Plane are stored directly when wchar_t is at
// least 32 bits wide and as a UTF-16 surrogate pair otherwise, so no character
// is replaced or lost.
// Throws std::runtime_error when the input is truncated, has an invalid lead
// or continuation byte, is overlong, encodes a UTF-16 surrogate, or exceeds
// U+10FFFF.
static std::wstring utf8_to_wstring(const std::string& str){
	const size_t size = str.size();
	std::wstring result;
	result.reserve(size); // never more wide units than input bytes
	for(size_t i = 0; i<size;){
		const unsigned char lead = static_cast<unsigned char>(str[i]);
		char32_t codepoint = 0; // payload bits accumulated so far
		char32_t smallest = 0;  // lowest value this sequence length may legally encode
		size_t tail = 0;        // number of continuation bytes that must follow
		if(lead<0x80){
			// 1-byte sequence: 0xxxxxxx
			codepoint = lead;
		} else if((lead&0xE0)==0xC0){
			// 2-byte sequence: 110xxxxx 10xxxxxx
			codepoint = lead&0x1F;
			smallest = 0x80;
			tail = 1;
		} else if((lead&0xF0)==0xE0){
			// 3-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx
			codepoint = lead&0x0F;
			smallest = 0x800;
			tail = 2;
		} else if((lead&0xF8)==0xF0){
			// 4-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
			codepoint = lead&0x07;
			smallest = 0x10000;
			tail = 3;
		} else{ throw std::runtime_error("Invalid UTF-8 sequence"); }
		// The whole sequence must fit inside the input
		if(i+tail>=size&&tail>0) throw std::runtime_error("Invalid UTF-8 sequence");
		for(size_t k = 1; k<=tail; k++){
			const unsigned char next = static_cast<unsigned char>(str[i+k]);
			// Every trailing byte must look like 10xxxxxx
			if((next&0xC0)!=0x80) throw std::runtime_error("Invalid UTF-8 sequence");
			codepoint = (codepoint<<6)|(next&0x3F);
		}
		// Reject overlong forms, encoded surrogates and values past U+10FFFF
		const bool is_surrogate = codepoint>=0xD800&&codepoint<=0xDFFF;
		if(codepoint<smallest||is_surrogate||codepoint>0x10FFFF) throw std::runtime_error("Invalid UTF-8 encoding");
		i += tail+1;
		if(codepoint<0x10000||sizeof(wchar_t)>=4){
			// Fits in a single wide unit
			result.push_back(static_cast<wchar_t>(codepoint));
		} else{
			// 16-bit wchar_t: split into a high/low surrogate pair
			const char32_t offset = codepoint-0x10000;
			result.push_back(static_cast<wchar_t>(0xD800|(offset>>10)));
			result.push_back(static_cast<wchar_t>(0xDC00|(offset&0x3FF)));
		}
	}
	return result;
}
// wstring to UTF-8 conversion.
// Encodes each code point of `wstr` as UTF-8. A high surrogate immediately
// followed by a low surrogate is combined into one code point and written as
// a 4-byte sequence; values of U+10000 and above held in a wide wchar_t are
// written as 4-byte sequences too. Unpaired surrogates and values outside the
// Unicode range are written as U+FFFD (replacement character), so the output
// is always well-formed UTF-8. This function does not throw on bad input.
static std::string wstring_to_utf8(const std::wstring& wstr){
	const size_t size = wstr.size();
	std::string result;
	// Worst case per wide unit: 4 bytes for a wide wchar_t, 3 for a 16-bit one
	// (a surrogate pair is two units producing four bytes).
	result.reserve(size*(sizeof(wchar_t)>=4 ? 4 : 3));
	// Reads unit `at` as an unsigned value, independent of wchar_t signedness.
	const auto unit_at = [&wstr](size_t at){
		char32_t unit = static_cast<char32_t>(wstr[at]);
		if(sizeof(wchar_t)<4){ unit &= 0xFFFF; } // undo sign extension of a narrow signed wchar_t
		return unit;
	};
	for(size_t i = 0; i<size; i++){
		char32_t cp = unit_at(i);
		if(cp>=0xD800&&cp<=0xDBFF){
			// High surrogate: valid only when a low surrogate follows
			const char32_t low = i+1<size ? unit_at(i+1) : 0;
			if(low>=0xDC00&&low<=0xDFFF){
				cp = 0x10000+((cp-0xD800)<<10)+(low-0xDC00);
				i++; // the low surrogate has been consumed
			} else{ cp = 0xFFFD; }
		} else if((cp>=0xDC00&&cp<=0xDFFF)||cp>0x10FFFF){
			// Stray low surrogate or a value that is not a Unicode code point
			cp = 0xFFFD;
		}
		if(cp<0x80){
			// 1-byte sequence: 0xxxxxxx
			result.push_back(static_cast<char>(cp));
		} else if(cp<0x800){
			// 2-byte sequence: 110xxxxx 10xxxxxx
			result.push_back(static_cast<char>(0xC0|(cp>>6)));
			result.push_back(static_cast<char>(0x80|(cp&0x3F)));
		} else if(cp<0x10000){
			// 3-byte sequence: 1110xxxx 10xxxxxx 10xxxxxx
			result.push_back(static_cast<char>(0xE0|(cp>>12)));
			result.push_back(static_cast<char>(0x80|((cp>>6)&0x3F)));
			result.push_back(static_cast<char>(0x80|(cp&0x3F)));
		} else{
			// 4-byte sequence: 11110xxx 10xxxxxx 10xxxxxx 10xxxxxx
			result.push_back(static_cast<char>(0xF0|(cp>>18)));
			result.push_back(static_cast<char>(0x80|((cp>>12)&0x3F)));
			result.push_back(static_cast<char>(0x80|((cp>>6)&0x3F)));
			result.push_back(static_cast<char>(0x80|(cp&0x3F)));
		}
	}
	return result;
}

Metadata

Metadata

Assignees

No one assigned

    Labels

    Type

    No type

    Projects

    No projects

    Milestone

    No milestone

    Relationships

    None yet

    Development

    No branches or pull requests

    Issue actions