Skip to content

Commit

Permalink
Use resources to get unicode data.
Browse files Browse the repository at this point in the history
Helps with #1.
  • Loading branch information
dhasenan committed Apr 14, 2017
1 parent 510f498 commit 795e20f
Show file tree
Hide file tree
Showing 7 changed files with 359 additions and 219,716 deletions.
225 changes: 133 additions & 92 deletions FastString.DataGen/Program.cs
Original file line number Diff line number Diff line change
@@ -1,132 +1,173 @@
using System;
using System.IO;
using System.Linq;
using System.Runtime.InteropServices;
using FastString.Unicode;

namespace FastString.DataGen
{
class MainClass
{
// Delimiter passed to line.Split(...) when breaking one data line into its fields.
static readonly char[] separator = { ';' };
// Format template handed to AppendFormat with (codepoint text, parsed category, "true"/"false").
static readonly utf8 CharInfoFormat = new utf8(@" new CharInfo
{{
Codepoint = 0x{0},
Category = UnicodeCategory.{1},
Mirrored = {2}");
// Value that field 9 is compared against to decide whether a character is mirrored.
static readonly utf8 IsMirrored = new utf8("Y");
// Format template that closes the emitted name byte array; {0} receives the name itself.
static readonly utf8 EndNameBytes = new utf8("}) /* {0} */");
// Text appended to close one emitted CharInfo initializer.
static readonly utf8 EndInstance = new utf8(@"
},
");

/// <summary>
/// Program entry point: creates a <see cref="MainClass"/> and passes the
/// command-line arguments straight through to <c>DoThings</c>.
/// </summary>
/// <param name="args">Command-line arguments, forwarded unchanged.</param>
public static void Main(string[] args)
{
    // Data file available at ftp://ftp.unicode.org/Public/UNIDATA/UnicodeData.txt
    var generator = new MainClass();
    generator.DoThings(args);
}

private FileStream main;

// Reads the file named by args[0], splits it into lines on '\n' and each line into
// fields on `separator`, and writes per-character output for every non-empty line.
//
// NOTE(review): this listing appears to interleave two revisions of the method.
// Statements that emit generated C# text (main.Append / main.AppendFormat on a local
// Utf8Writer named `main`) sit next to statements that emit binary records through
// the FileStream field `this.main` (WriteCodepoint / WriteOffset / WriteByte /
// WriteFloat). Confirm which statements belong to the current revision before
// changing any code here.
public void DoThings(string[] args)
{
// Data file available at http://www.unicode.org/Public/UCD/latest/ucd/UnicodeData.txt
var data = new utf8(File.ReadAllBytes(args[0]));

var lines = new Splitter('\n', data);

var mainOutFile = new FileStream("CharInfo.RawData.cs", FileMode.Create, FileAccess.Write);
var main = new Utf8Writer(mainOutFile);
main.Append(@"
// This file is autogenerated by a tool.
// Please do not edit it.
using System.Collections.Generic;
namespace FastString.Unicode
{
public partial struct CharInfo
{
public static IReadOnlyList<CharInfo> Characters = new CharInfo[]
{
");
this.main = new FileStream("chardata", FileMode.Create, FileAccess.Write);
var namesRaw = new FileStream("charnames", FileMode.Create, FileAccess.Write);
var names = new Utf8Writer(namesRaw);
// TODO normalization map
int i = 0;
long per = 0;
foreach (var line in lines)
{
i++;
Console.WriteLine(i);
if (line.IsEmpty) continue;
var start = main.Position;
Console.WriteLine("writing {0} at offset {1}", i, main.Position);
var parts = line.Split(separator);

// Schema given in ftp://ftp.unicode.org/Public/3.0-Update/UnicodeData-3.0.0.html
main.AppendFormat(CharInfoFormat, parts[0], CharInfo.ParseCategory(parts[2]), (parts[9] == IsMirrored ? "true" : "false"));
if (parts[1].HasValue)
{
// Everything should have a name...
main.Append(@",
Name = new utf8(new byte[]{");
foreach (var b in parts[1].Bytes)
{
main.Append(b);
main.Append(',');
}
main.AppendFormat(EndNameBytes, parts[1]);
}
// The current codepoint (eg U+00A2).
WriteCodepoint(parts[0]);

// Simple cases.
if (parts[12].HasValue)
{
main.Append(@",
Uppercase = 0x");
main.Append(parts[12]);
}
if (parts[13].HasValue)
// The name of this codepoint (eg LATIN SMALL LETTER E WITH MACRON).
// Recorded as the charnames stream position before and after the name is appended.
WriteOffset(namesRaw.Position);
names.Append(parts[1]);
WriteOffset(namesRaw.Position);

// The major category, like Ll or Sm
WriteByte(ParseCategory(parts[2]));

// Numeric value.
WriteFloat(GetNumericValue(parts));

// Whether this thing is mirrored
WriteByte((byte)(parts[9] == IsMirrored ? 1 : 0));

// Upper
WriteCodepoint(parts[12], parts[0]);
// Lower
WriteCodepoint(parts[13], parts[0]);
// Title
WriteCodepoint(parts[14], parts[0]);
// Record-size check: `per` keeps the Position delta (end - start) of the first
// record, and every later record's delta is compared against it.
var end = main.Position;
if (per == 0)
{
main.Append(@",
Lowercase = 0x");
main.Append(parts[13]);
per = end - start;
}
if (parts[14].HasValue)
else
{
main.Append(@",
Titlecase = 0x");
main.Append(parts[14]);
if (per != end - start)
{
// NOTE(review): in a C# interpolated string the holes are written {i} and {per};
// "$i" and "$per" are literal text, so this message will not contain the entry
// number or the expected size, and "${end - start}" renders as "$" followed by
// the value. Likely intended: $"at entry {i}, expected {per} bytes written;
// actual was {end - start}". Also consider a more specific exception type than
// System.Exception.
throw new Exception($"at entry $i, expected $per bytes written; actual was ${end - start}");
}
}
}
main.Flush();
main.Close();
namesRaw.Flush();
namesRaw.Close();
}

// Numeric value.
// There are several different numeric values listed.
if (parts[6].HasValue)
{
main.Append(@",
NumericValue = ");
main.Append(parts[6]);
}
else if (parts[7].HasValue)
// Returns the numeric value for one parsed record: field 6 if it has a value,
// otherwise field 7, otherwise field 8 (either a plain integer or a
// "numerator/denominator" pair split on '/'). Returns float.NaN when none of the
// three fields has a value.
//
// NOTE(review): this listing appears to interleave two revisions. The
// main.Append(...) statements that emit "NumericValue = ..." text, the second
// `var p` declaration, the doubled `else if` / `else`, and the reference to `i`
// look like they belong to the inline version that used to live inside
// DoThings. Confirm before editing.
float GetNumericValue(utf8[] parts)
{
if (parts[6].HasValue)
{
return utf8.ParseInt(parts[6]);
}
if (parts[7].HasValue)
{
return utf8.ParseInt(parts[7]);
}
if (parts[8].HasValue)
{
var p = parts[8].Split(new char[] { '/' }, 3);
Console.WriteLine("float? value {0} in {1} parts", parts[8], p.Length);
if (p.Length == 2)
{
main.Append(@",
NumericValue = ");
main.Append(parts[7]);
Console.WriteLine("{0} / {1}", p[0], p[1]);
Console.WriteLine("{0} / {1}", p[0].Trim(), p[1].Trim());
// Presumably utf8.ParseInt returns an integer type, so the 1.0f factor is what
// makes this a floating-point division - TODO confirm.
return (utf8.ParseInt(p[0].Trim()) * 1.0f / utf8.ParseInt(p[1].Trim()));
}
else if (parts[8].HasValue)
else
{
main.Append(@",
NumericValue = ");
// Parse the subsequent rational number.
var p = parts[8].Split(new char[] { '/' }, 3);
if (p.Length == 1)
{
main.Append(parts[8]);
}
else if (p.Length == 2)
{
Console.WriteLine("fraction! [{0}] slash [{1}]", p[0], p[1]);
Console.WriteLine("fraction! [{0}] slash [{1}]", p[0].Trim(), p[1].Trim());
main.Append(utf8.ParseInt(p[0].Trim()) * 1.0 / utf8.ParseInt(p[1].Trim()));
}
else
{
Console.WriteLine("error on line {0}: can't parse rational number {1}", i, parts[8]);
}
return utf8.ParseInt(parts[8]);
}
}
return float.NaN;
}

/// <summary>
/// Writes a float by reinterpreting its bits as a uint through
/// <see cref="FloatBytes"/> and passing that uint to <c>WriteUint32</c>.
/// </summary>
/// <param name="v">The value to write.</param>
void WriteFloat(float v)
{
    var bits = new FloatBytes { f = v };
    WriteUint32(bits.i);
}

/// <summary>
/// Writes a stream offset as a 32-bit value via <c>WriteUint32</c>.
/// The long is narrowed with a plain <c>(uint)</c> cast; no range validation is done here.
/// </summary>
/// <param name="v">The offset to write.</param>
void WriteOffset(long v) => WriteUint32((uint)v);

/// <summary>
/// Writes the four bytes of <paramref name="v"/> to <c>main</c>, most
/// significant byte first.
/// </summary>
/// <param name="v">The 32-bit value to write.</param>
void WriteUint32(uint v)
{
    // Walk the value from the top byte (shift 24) down to the bottom byte (shift 0).
    for (int shift = 24; shift >= 0; shift -= 8)
    {
        main.WriteByte((byte)((v >> shift) & 0xFF));
    }
}

/// <summary>Writes a single byte to <c>main</c>.</summary>
/// <param name="v">The byte to write.</param>
void WriteByte(byte v) => main.WriteByte(v);

main.Append(EndInstance);
// Writes `codepoint` when it has a value and returns; otherwise falls through to
// writing `backup`. The callers visible in DoThings pass the record's own
// codepoint (parts[0]) as `backup` for the upper/lower/title fields.
//
// NOTE(review): the main.Append(...) / mainOutFile.Flush() / mainOutFile.Close()
// statements between the early return and WriteCodepoint(backup) reference
// `mainOutFile`, which is a local of DoThings. They look like lines from a
// different revision interleaved into this listing. Confirm before editing.
void WriteCodepoint(utf8 codepoint, utf8 backup)
{
if (codepoint.HasValue)
{
WriteCodepoint(codepoint);
return;
}
main.Append(@"
};
}
}
");
mainOutFile.Flush();
mainOutFile.Close();
WriteCodepoint(backup);
}

/// <summary>
/// Writes one codepoint as a 32-bit value: the base-16 parse of
/// <paramref name="codepoint"/> when it has a value, otherwise 0.
/// </summary>
/// <param name="codepoint">Hexadecimal codepoint text, possibly without a value.</param>
void WriteCodepoint(utf8 codepoint)
{
    uint value = codepoint.HasValue
        ? (uint)utf8.ParseInt(codepoint, 16)
        : 0;
    WriteUint32(value);
}

/// <summary>
/// Parses a category with <c>CharInfo.ParseCategory</c> and casts the result to a byte.
/// </summary>
/// <param name="str">The category text to parse.</param>
/// <returns>The parsed category cast to <c>byte</c>.</returns>
byte ParseCategory(utf8 str) => (byte)CharInfo.ParseCategory(str);
}

/// <summary>
/// Overlays a <c>float</c> and a <c>uint</c> at the same offset, so storing one
/// field and reading the other reinterprets the same four bytes.
/// </summary>
[StructLayout(LayoutKind.Explicit, Pack = 1)]
struct FloatBytes
{
    /// <summary>The shared bytes viewed as an unsigned 32-bit integer.</summary>
    [FieldOffset(0)] public uint i;

    /// <summary>The shared bytes viewed as a single-precision float.</summary>
    [FieldOffset(0)] public float f;
}
}
33 changes: 26 additions & 7 deletions FastString.Test/CharInfoSmokeTest.cs
Original file line number Diff line number Diff line change
Expand Up @@ -7,15 +7,34 @@ namespace FastString.Test
[TestFixture]
public class CharInfoSmokeTest
{
// Runs before each test: loads the character data, then the character names,
// through the two static CharInfo loaders.
[SetUp]
public void Setup()
{
CharInfo.LoadCharacterData();
CharInfo.LoadCharacterNames();
}

/// <summary>The tab character's name should be reported as "&lt;control&gt;".</summary>
[Test]
public void AsciiTab()
{
    var tab = CharInfo.For('\t').Value;
    var expectedName = new utf8("<control>");

    // The actual name doubles as the failure message.
    Assert.That(tab.Name, Is.EqualTo(expectedName), tab.Name.ToString());
}

/// <summary>
/// '$' should report codepoint 0x24, the name "DOLLAR SIGN", and the
/// SymbolCurrency category.
/// </summary>
[Test]
public void AsciiDollar()
{
    var dollar = CharInfo.For('$').Value;
    var expectedName = new utf8("DOLLAR SIGN");

    Assert.That(dollar.Codepoint, Is.EqualTo(0x24u));
    Assert.That(dollar.Name, Is.EqualTo(expectedName), dollar.Name.ToString());
    Assert.That(dollar.Category, Is.EqualTo(UnicodeCategory.SymbolCurrency));
}

// Asserts that CharInfo.For returns a non-null result for 0x100000 and 0x10FFFD.
//
// NOTE(review): two method signatures (IndexCodepoint and NearTop) appear back
// to back in this listing, and the commented-out loop refers to
// CharInfo.Characters. These look like a renamed test plus leftover lines from
// the earlier revision. Confirm which signature is current.
[Test]
public void IndexCodepoint()
public void NearTop()
{
/*
for (int i = 0; i < CharInfo.Characters.Count; i++)
{
Assert.That(CharInfo.Characters[i].Codepoint, Is.EqualTo((uint)i));
}
*/
Assert.NotNull(CharInfo.For(0x100000));
Assert.NotNull(CharInfo.For(0x10FFFD));
}
}
}
7 changes: 5 additions & 2 deletions FastString/FastString.csproj
Original file line number Diff line number Diff line change
Expand Up @@ -40,10 +40,13 @@
<Compile Include="Utf8Writer.cs" />
<Compile Include="Splitter.cs" />
<Compile Include="Unicode\CharInfo.cs" />
<Compile Include="Unicode\CharInfo.RawData.cs" />
</ItemGroup>
<ItemGroup>
<Folder Include="Unicode\" />
</ItemGroup>
<ItemGroup>
<EmbeddedResource Include="Unicode\chardata" />
<EmbeddedResource Include="Unicode\charnames" />
</ItemGroup>
<Import Project="$(MSBuildBinPath)\Microsoft.CSharp.targets" />
</Project>
</Project>
Loading

0 comments on commit 795e20f

Please sign in to comment.