-
Notifications
You must be signed in to change notification settings - Fork 5
Commit
This commit does not belong to any branch on this repository, and may belong to a fork outside of the repository.
Helps with #1.
- Loading branch information
Showing
7 changed files
with
359 additions
and
219,716 deletions.
There are no files selected for viewing
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Original file line number | Diff line number | Diff line change |
---|---|---|
@@ -1,132 +1,173 @@ | ||
using System; | ||
using System.IO; | ||
using System.Linq; | ||
using System.Runtime.InteropServices; | ||
using FastString.Unicode; | ||
|
||
namespace FastString.DataGen | ||
{ | ||
class MainClass | ||
{ | ||
// Field delimiter handed to line.Split when a data line is broken into its parts. | ||
static readonly char[] separator = { ';' }; | ||
// Format string for the start of one generated CharInfo initializer. DoThings fills the | ||
// placeholders with parts[0], the parsed category of parts[2], and "true"/"false" for mirroring. | ||
static readonly utf8 CharInfoFormat = new utf8(@" new CharInfo | ||
{{ | ||
Codepoint = 0x{0}, | ||
Category = UnicodeCategory.{1}, | ||
Mirrored = {2}"); | ||
// Value that parts[9] is compared against to decide whether a character is mirrored. | ||
static readonly utf8 IsMirrored = new utf8("Y"); | ||
// Format string appended after the emitted name bytes; {0} is filled with parts[1]. | ||
static readonly utf8 EndNameBytes = new utf8("}) /* {0} */"); | ||
// Text appended through main.Append(EndInstance); presumably closes the brace opened by | ||
// CharInfoFormat - confirm against the generated output. | ||
static readonly utf8 EndInstance = new utf8(@" | ||
}, | ||
"); | ||
|
||
/// <summary> | ||
/// Program entry point: creates a MainClass instance and hands it the command-line arguments. | ||
/// </summary> | ||
/// <param name="args">Command-line arguments, forwarded unchanged to DoThings.</param> | ||
public static void Main(string[] args) | ||
{ | ||
// The input data file can be fetched from ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt | ||
var generator = new MainClass(); | ||
generator.DoThings(args); | ||
} | ||
|
||
// Output stream that DoThings opens on the "chardata" file; the Write* helpers write their bytes to it. | ||
private FileStream main; | ||
|
||
/// <summary> | ||
/// Reads the Unicode data file named by args[0], splits it into lines and ';'-separated fields, | ||
/// and writes one record per non-empty line to the "chardata" file, with character names going to | ||
/// the "charnames" file. Throws if two records differ in the number of bytes written. | ||
/// </summary> | ||
/// <param name="args">Command-line arguments; args[0] is the path of the data file to read.</param> | ||
/// <exception cref="InvalidOperationException">A record's byte count differs from the first record's.</exception> | ||
// NOTE(review): this listing appears to interleave removed and added lines of the commit diff | ||
// (for example both the "CharInfo.RawData.cs" text writer and the "chardata" stream are named main) | ||
// - confirm against the real file before relying on the statement order shown here. | ||
public void DoThings(string[] args) | ||
{ | ||
// Data file available at http://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt | ||
var data = new utf8(File.ReadAllBytes(args[0])); | ||
|
||
var lines = new Splitter('\n', data); | ||
|
||
var mainOutFile = new FileStream("CharInfo.RawData.cs", FileMode.Create, FileAccess.Write); | ||
var main = new Utf8Writer(mainOutFile); | ||
main.Append(@" | ||
// This file is autogenerated by a tool. | ||
// Please do not edit it. | ||
using System.Collections.Generic; | ||
namespace FastString.Unicode | ||
{ | ||
public partial struct CharInfo | ||
{ | ||
public static IReadOnlyList<CharInfo> Characters = new CharInfo[] | ||
{ | ||
"); | ||
this.main = new FileStream("chardata", FileMode.Create, FileAccess.Write); | ||
var namesRaw = new FileStream("charnames", FileMode.Create, FileAccess.Write); | ||
var names = new Utf8Writer(namesRaw); | ||
// TODO normalization map | ||
int i = 0; | ||
long per = 0; | ||
foreach (var line in lines) | ||
{ | ||
i++; | ||
Console.WriteLine(i); | ||
if (line.IsEmpty) continue; | ||
var start = main.Position; | ||
Console.WriteLine("writing {0} at offset {1}", i, main.Position); | ||
var parts = line.Split(separator); | ||
|
||
// Schema given in ftp://ftp.unicode.org/Public/3.0-Update/UnicodeData-3.0.0.html | ||
main.AppendFormat(CharInfoFormat, parts[0], CharInfo.ParseCategory(parts[2]), (parts[9] == IsMirrored ? "true" : "false")); | ||
if (parts[1].HasValue) | ||
{ | ||
// Everything should have a name... | ||
main.Append(@", | ||
Name = new utf8(new byte[]{"); | ||
foreach (var b in parts[1].Bytes) | ||
{ | ||
main.Append(b); | ||
main.Append(','); | ||
} | ||
main.AppendFormat(EndNameBytes, parts[1]); | ||
} | ||
// The current codepoint (eg U+00A2). | ||
WriteCodepoint(parts[0]); | ||
|
||
// Simple cases. | ||
if (parts[12].HasValue) | ||
{ | ||
main.Append(@", | ||
Uppercase = 0x"); | ||
main.Append(parts[12]); | ||
} | ||
if (parts[13].HasValue) | ||
// The name of this codepoint (eg LATIN SMALL LETTER E WITH MACRON). | ||
// | ||
WriteOffset(namesRaw.Position); | ||
names.Append(parts[1]); | ||
WriteOffset(namesRaw.Position); | ||
|
||
// The major category, like Ll or Sm | ||
WriteByte(ParseCategory(parts[2])); | ||
|
||
// Numeric value. | ||
WriteFloat(GetNumericValue(parts)); | ||
|
||
// Whether this thing is mirrored | ||
WriteByte((byte)(parts[9] == IsMirrored ? 1 : 0)); | ||
|
||
// Upper | ||
WriteCodepoint(parts[12], parts[0]); | ||
// Lower | ||
WriteCodepoint(parts[13], parts[0]); | ||
// Title | ||
WriteCodepoint(parts[14], parts[0]); | ||
// The first record sets the expected record size; every later record must match it. | ||
var end = main.Position; | ||
if (per == 0) | ||
{ | ||
main.Append(@", | ||
Lowercase = 0x"); | ||
main.Append(parts[13]); | ||
per = end - start; | ||
} | ||
if (parts[14].HasValue) | ||
else | ||
{ | ||
main.Append(@", | ||
Titlecase = 0x"); | ||
main.Append(parts[14]); | ||
if (per != end - start) | ||
{ | ||
throw new InvalidOperationException($"at entry {i}, expected {per} bytes written; actual was {end - start}"); | ||
} | ||
} | ||
} | ||
main.Flush(); | ||
main.Close(); | ||
namesRaw.Flush(); | ||
namesRaw.Close(); | ||
} | ||
|
||
// Numeric value. | ||
// There are several different numeric values listed. | ||
if (parts[6].HasValue) | ||
{ | ||
main.Append(@", | ||
NumericValue = "); | ||
main.Append(parts[6]); | ||
} | ||
else if (parts[7].HasValue) | ||
/// <summary> | ||
/// Picks the numeric value of a character from its split data fields. Returns the parsed | ||
/// parts[6] if it has a value, otherwise the parsed parts[7]; otherwise parts[8] is split on '/' | ||
/// and, when that yields exactly two pieces, the first divided by the second is returned as a | ||
/// float, else parts[8] is parsed as a whole. Returns float.NaN when none of the three has a value. | ||
/// </summary> | ||
/// <param name="parts">The fields of one data line.</param> | ||
// NOTE(review): this listing appears to interleave removed lines of the commit diff (the | ||
// main.Append calls and the second declaration of p) with the new method body - confirm | ||
// against the real file. | ||
float GetNumericValue(utf8[] parts) | ||
{ | ||
if (parts[6].HasValue) | ||
{ | ||
return utf8.ParseInt(parts[6]); | ||
} | ||
if (parts[7].HasValue) | ||
{ | ||
return utf8.ParseInt(parts[7]); | ||
} | ||
if (parts[8].HasValue) | ||
{ | ||
var p = parts[8].Split(new char[] { '/' }, 3); | ||
Console.WriteLine("float? value {0} in {1} parts", parts[8], p.Length); | ||
if (p.Length == 2) | ||
{ | ||
main.Append(@", | ||
NumericValue = "); | ||
main.Append(parts[7]); | ||
Console.WriteLine("{0} / {1}", p[0], p[1]); | ||
Console.WriteLine("{0} / {1}", p[0].Trim(), p[1].Trim()); | ||
return (utf8.ParseInt(p[0].Trim()) * 1.0f / utf8.ParseInt(p[1].Trim())); | ||
} | ||
else if (parts[8].HasValue) | ||
else | ||
{ | ||
main.Append(@", | ||
NumericValue = "); | ||
// Parse the subsequent rational number. | ||
var p = parts[8].Split(new char[] { '/' }, 3); | ||
if (p.Length == 1) | ||
{ | ||
main.Append(parts[8]); | ||
} | ||
else if (p.Length == 2) | ||
{ | ||
Console.WriteLine("fraction! [{0}] slash [{1}]", p[0], p[1]); | ||
Console.WriteLine("fraction! [{0}] slash [{1}]", p[0].Trim(), p[1].Trim()); | ||
main.Append(utf8.ParseInt(p[0].Trim()) * 1.0 / utf8.ParseInt(p[1].Trim())); | ||
} | ||
else | ||
{ | ||
Console.WriteLine("error on line {0}: can't parse rational number {1}", i, parts[8]); | ||
} | ||
return utf8.ParseInt(parts[8]); | ||
} | ||
} | ||
return float.NaN; | ||
} | ||
|
||
/// <summary> | ||
/// Writes a float to the output as the 32-bit pattern that shares its storage in FloatBytes. | ||
/// </summary> | ||
/// <param name="v">Value to write.</param> | ||
void WriteFloat(float v) | ||
{ | ||
// FloatBytes places f and i at the same field offset, so i reads back the bits stored via f. | ||
var overlay = new FloatBytes { f = v }; | ||
WriteUint32(overlay.i); | ||
} | ||
|
||
/// <summary> | ||
/// Writes a stream offset to the output as a 32-bit unsigned integer. | ||
/// </summary> | ||
/// <param name="v">Offset to write; must fit in a uint.</param> | ||
/// <exception cref="System.OverflowException">v is negative or larger than uint.MaxValue.</exception> | ||
void WriteOffset(long v) | ||
{ | ||
// checked: fail loudly instead of silently truncating an offset that does not fit in 32 bits. | ||
WriteUint32(checked((uint)v)); | ||
} | ||
|
||
/// <summary> | ||
/// Writes a 32-bit unsigned integer to main as four bytes, most significant byte first. | ||
/// </summary> | ||
/// <param name="v">Value to write.</param> | ||
void WriteUint32(uint v) | ||
{ | ||
// Walk from the top byte (shift 24) down to the bottom byte (shift 0). | ||
for (int shift = 24; shift >= 0; shift -= 8) | ||
{ | ||
main.WriteByte((byte)((v >> shift) & 0xFF)); | ||
} | ||
} | ||
|
||
/// <summary> | ||
/// Writes a single byte to main. | ||
/// </summary> | ||
/// <param name="v">Byte to write.</param> | ||
void WriteByte(byte v) => main.WriteByte(v); | ||
|
||
main.Append(EndInstance); | ||
/// <summary> | ||
/// Writes codepoint when it has a value; otherwise writes backup in its place. | ||
/// </summary> | ||
/// <param name="codepoint">Preferred codepoint text; used when HasValue is true.</param> | ||
/// <param name="backup">Codepoint text written when codepoint has no value.</param> | ||
// NOTE(review): the main.Append(...) and mainOutFile lines between the early return and the | ||
// final call look like removed lines of the commit diff interleaved into this listing - confirm | ||
// against the real file. | ||
void WriteCodepoint(utf8 codepoint, utf8 backup) | ||
{ | ||
if (codepoint.HasValue) | ||
{ | ||
WriteCodepoint(codepoint); | ||
return; | ||
} | ||
main.Append(@" | ||
}; | ||
} | ||
} | ||
"); | ||
mainOutFile.Flush(); | ||
mainOutFile.Close(); | ||
WriteCodepoint(backup); | ||
} | ||
|
||
/// <summary> | ||
/// Writes a codepoint as a 32-bit unsigned integer: the result of utf8.ParseInt(codepoint, 16) | ||
/// when the text has a value, and 0 when it does not. | ||
/// </summary> | ||
/// <param name="codepoint">Codepoint text to parse and write.</param> | ||
void WriteCodepoint(utf8 codepoint) | ||
{ | ||
uint value = codepoint.HasValue ? (uint)utf8.ParseInt(codepoint, 16) : 0u; | ||
WriteUint32(value); | ||
} | ||
|
||
/// <summary> | ||
/// Parses a category string with CharInfo.ParseCategory and narrows the result to a byte. | ||
/// </summary> | ||
/// <param name="str">Category text to parse.</param> | ||
/// <returns>The parsed category cast to byte.</returns> | ||
byte ParseCategory(utf8 str) => (byte)CharInfo.ParseCategory(str); | ||
} | ||
|
||
/// <summary> | ||
/// Overlays a float and a uint at the same field offset, so a value stored through one field | ||
/// can be read back through the other. | ||
/// </summary> | ||
[StructLayout(LayoutKind.Explicit, Pack = 1)] | ||
struct FloatBytes | ||
{ | ||
// Unsigned-integer view of the shared storage. | ||
[FieldOffset(0)] public uint i; | ||
// Floating-point view of the shared storage. | ||
[FieldOffset(0)] public float f; | ||
} | ||
} |
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
This file contains bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
Oops, something went wrong.