In [15]:
import datetime
import os

In [16]:
# First and last day (both inclusive) for which a script is generated.
begin = datetime.date(year=2022, month=3, day=24)
end = datetime.date(year=2022, month=4, day=13)
# Step used when walking from `begin` to `end`.
delta = datetime.timedelta(days=1)
# NOTE(review): not referenced by the visible cells; presumably a
# train/validation boundary -- confirm against the rest of the notebook.
split_date = datetime.date(year=2022, month=4, day=7)

In [17]:
def generate_scope(d, out_dir='scopes5'):
    """Write the SCOPE down-sampling script for one day to ``<out_dir>/<YYYYMMDD>.script``.

    The script text is a fixed template; only the ``YYYYMMDD`` stamp inside the
    input and output stream paths changes from one day to the next.

    Args:
        d: The day to generate for; a ``datetime.date`` or any object with a
            compatible ``strftime`` method.
        out_dir: Directory that receives the script file. It is created when
            missing. Defaults to ``'scopes5'`` relative to the working directory.

    Returns:
        None. The compact and dashed date stamps are printed as a progress line.

    Raises:
        OSError: If the directory cannot be created or the file cannot be written.
    """
    cdate = d.strftime("%Y%m%d")
    cdatef = d.strftime("%Y-%m-%d")
    print(cdate, cdatef)

    # Per-day preamble. This is the only part that goes through str.format();
    # the template currently uses {cdate} only, {cdatef} is supplied but unused.
    preamble = '''//Script 
#DECLARE Origin_data_path string = "/shares/bingads.hm/local/users/Yunfan/train_val/{cdate}.tsv";
#DECLARE Origin_header_path string = "/shares/bingads.hm/local/users/Yunfan/train_val/header.tsv";
RESOURCE @Origin_header_path;
#DECLARE OutputPath string = "/shares/bingads.hm/local/users/Yunfan/downsamples_20/{cdate}.tsv";
#DECLARE ExpiryDuration string = "30";
'''.format(cdate=cdate, cdatef=cdatef)

    # Fixed part of the script. It is deliberately NOT passed through
    # str.format() because the C# code is full of literal braces. The literal is
    # raw so that the regex backslashes (\w, \d) are not treated as (invalid)
    # Python escape sequences; the resulting text is unchanged.
    fixed_body = r'''
originData = 
    EXTRACT * 
FROM @Origin_data_path
USING TSVWithHeaderExtractor("-schemaFile", @Origin_header_path);

TrainData =
    SELECT * FROM originData WHERE m_Click == "1"
    UNION ALL
    SELECT * FROM originData SAMPLE UNIFORM (0.195) WHERE m_Click == "0";
    
OUTPUT TrainData
TO @OutputPath
    WITH STREAMEXPIRY @ExpiryDuration;

#CS
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using System.Text.RegularExpressions;
using System.Linq;
using ScopeRuntime;

public class HeaderOutputter : Outputter
{
    public static string RevertLegalizeColumnName(string name)
    {
        string legalized = name.Replace("DateTime_", "DateTime");
        legalized = Regex.Replace(legalized, @"(^m_)", "m:");
        legalized = Regex.Replace(legalized, @"(_atsymbol_)", "@");
        return legalized;
    }

    public override void Output(RowSet input, StreamWriter writer, string[] args)
    {
        // output the header
        writer.WriteLine(string.Join("	", input.Schema.ToString().Split(',').Select(c => RevertLegalizeColumnName(c.Split(':')[0])).ToArray()));
        writer.Flush();
    }
}
public class TSVWithHeaderExtractor : Extractor
{
    private bool initialized = false;
    private string delim = "	";
    private string schemaFile = "";
    private string schemaString = "";
    
    public static string LegalizeColumnName(string name)
    {
        string legalized = name.Replace("DateTime", "DateTime_");
        legalized = Regex.Replace(legalized, @"(^m:)", "m_");
        legalized = Regex.Replace(legalized, @"(@)", "_atsymbol_");

        // replace the illegal char brought in by Category transform and Generic transform
        legalized = Regex.Replace(legalized, @"([^\w\d_]+)", "_");

        // and cut the length to 450 if it's too long. C# has a 512 char length limit to identifier(so does scope)
        legalized = legalized.Substring(0, Math.Min(450, legalized.Length)); // cap length to 450

        return legalized;
    }

    public void Initialize(string[] args)
    {
        if (initialized)
            return;

        // create and add 
        // Process args, if any
        for (Int32 i = 0; i < args.Length; i++)
        {
            switch (args[i])
            {
                case "-schemaFile":
                    if (i == args.Length - 1)
                        throw new ArgumentException("No value specified", "schemaFile");
                    schemaFile = Path.GetFileName(args[++i]); //only get file name

                    // load up schemafile
                    using (StreamReader sr = new StreamReader(schemaFile))//read header
                    {
                        schemaString = string.Join(",", sr.ReadLine().Trim().Split('	').Select(t => LegalizeColumnName(t)).ToArray());
                    }
                    break;

                case "-delim":
                    if (i == args.Length - 1)
                        throw new ArgumentException("No value specified", "delim");
                    delim = args[++i];
                    break;
            }
        }

        initialized = true;
    }

    public override Schema GetOutputSchemaAtCompileTime(string[] requestedColumns, string[] args)
    {
        Initialize(args);
        return new Schema(schemaString);
    }

    public override IEnumerable<Row> Extract(StreamReader reader, Row outputRow, string[] args)
    {
        Initialize(args);

        string line;
        int count = 0;
        
        while ((line = reader.ReadLine()) != null)
        {
            count++;

            if (count == 1 && (line.StartsWith("m:", StringComparison.OrdinalIgnoreCase) || line.StartsWith("Feature_", StringComparison.OrdinalIgnoreCase)))//skip header
            {
                continue;
            }

            string[] tokens = line.Split(delim.ToCharArray());
            if (tokens.Length < outputRow.Count)
            {
                continue;
            }
            
            try
            {
                for (int i = 0; i < tokens.Length; ++i)
                {
                    outputRow[i].UnsafeSet(tokens[i]);
                }
            }
            catch(Exception)
            {
                continue;
            }

            yield return outputRow;
        }
    }
}


public class ProcessEmbLineToVectors : Processor
{
    public override Schema Produces(string[] requestedColumns, string[] args, Schema input)
    {
        string EmbType = args[0];
        string feature_pre_name = args[1];

        string line = EmbType;
        for (int i=0; i<32; i++)
        {
            line += "," + feature_pre_name +  Convert.ToString(i) + ":int";
        }
        return new Schema(line);
    }
    public override IEnumerable<Row> Process(RowSet input, Row outputRow, string[] args)
    {
        string EmbType = args[0];
        string feature_pre_name = args[1];
        foreach (var inputRow in input.Rows)
        {
            outputRow[EmbType].UnsafeSet(inputRow[EmbType].String);
            
            for (var i = 0; i <= 31; ++i)
            {
                double v = Convert.ToDouble(inputRow[feature_pre_name + Convert.ToString(i)].String);
                v = v * 127 + 127;
                int h =  Convert.ToInt32(Math.Max(Math.Min(v, 255), 0));
                outputRow[feature_pre_name + Convert.ToString(i)].UnsafeSet(h);
            }
            yield return outputRow;
        }
    }
}


#ENDCS
'''

    # Create the target directory on demand so a fresh checkout does not fail
    # with FileNotFoundError. An empty out_dir means "current directory".
    if out_dir:
        os.makedirs(out_dir, exist_ok=True)
    script_path = os.path.join(out_dir, '{}.script'.format(cdate))

    # The file is opened only after the text is fully built, and the context
    # manager closes it even if the write raises.
    with open(script_path, 'w') as f:
        f.write(preamble + fixed_body)

In [18]:
# Walk the inclusive range [begin, end] one `delta` at a time and write one
# script per day. `d` ends up one step past `end`.
d = begin
while True:
    if d > end:
        break
    generate_scope(d)
    d += delta

20220324 2022-03-24
20220325 2022-03-25
20220326 2022-03-26
20220327 2022-03-27
20220328 2022-03-28
20220329 2022-03-29
20220330 2022-03-30
20220331 2022-03-31
20220401 2022-04-01
20220402 2022-04-02
20220403 2022-04-03
20220404 2022-04-04
20220405 2022-04-05
20220406 2022-04-06
20220407 2022-04-07
20220408 2022-04-08
20220409 2022-04-09
20220410 2022-04-10
20220411 2022-04-11
20220412 2022-04-12
20220413 2022-04-13
