In [1]:
import datetime
import os

In [2]:
# Relative locations of the data / header TSV files, one (data, header) pair
# per date window encoded in the directory name.
first_data, first_header = (
    '2022-03-24_2022-04-06/TA/data.tsv',
    '2022-03-24_2022-04-06/TA/header_TA.tsv',
)
second_data, second_header = (
    '2022-04-07_2022-04-13/TA/dataCommon.tsv',
    '2022-04-07_2022-04-13/TA/headerCommon.tsv',
)

In [3]:
# Inclusive range of days to generate scripts for, walked one day at a time.
begin, end = datetime.date(2022, 3, 24), datetime.date(2022, 4, 13)
delta = datetime.timedelta(days=1)

# Days strictly before this date use the "first" data/header pair;
# this date and later use the "second" pair.
split_date = datetime.date(2022, 4, 7)

In [6]:
# Query half of the generated script. ``{cdate}`` (YYYYMMDD) is the only
# placeholder; it is filled in with ``str.format`` so this part must not
# contain any other curly braces.
_SCOPE_QUERY_TEMPLATE = '//Script \n' + '''#DECLARE Origin_data_path string = "/shares/bingads.hm/local/users/Yunfan/train_val/{cdate}.tsv";
#DECLARE Origin_header_path string = "/shares/bingads.hm/local/users/Yunfan/train_val/header_{cdate}.tsv";
RESOURCE @Origin_header_path;
#DECLARE Output_path string = "/shares/bingads.hm/local/users/Yunfan/samples/{cdate}.tsv";
#DECLARE ExpiryDuration string = "30";

originData = 
    EXTRACT * 
FROM @Origin_data_path
USING TSVWithHeaderExtractor("-schemaFile", @Origin_header_path);

output =
    SELECT * FROM originData SAMPLE UNIFORM (0.06);

OUTPUT output
TO @Output_path
    WITH STREAMEXPIRY @ExpiryDuration;

'''

# C# code-behind half of the generated script. It is appended verbatim (never
# passed through ``str.format``, because it is full of curly braces).
# This is a *raw* string on purpose: the C# escapes (``\t``) and regex classes
# (``\w``, ``\d``) must reach the file exactly as written. In a normal Python
# string ``\w``/``\d`` are invalid escape sequences and ``\t`` would be turned
# into a literal tab character inside the generated C# string literals.
_CSHARP_CODE_BEHIND = r'''
#CS
using System;
using System.Collections.Generic;
using System.IO;
using System.Text;
using System.Text.RegularExpressions;
using System.Linq;
using ScopeRuntime;


public class RandomUtils
{
    /// <summary>
    /// Get a random UInt64
    /// </summary>
    /// <returns>Random UInt64</returns>
    public static double GetRandomConsistentDoubleFromRGUID(string RGUID)
    {
        double rn = 0.99;
        Guid gd = new Guid();

        try
        {
            gd = Guid.Parse(RGUID);
        }
        catch (Exception e)
        {
            return rn;
        }

        byte[] bytes = gd.ToByteArray();
        int seed = BitConverter.ToInt32(bytes, 0);
        double rp = new Random(seed).NextDouble();

        return rp;
    }


}


public class TSVWithHeaderExtractor : Extractor
{
    private bool initialized = false;
    private string delim = "\t";
    private string schemaFile = "";
    private string schemaString = "";

       public static string LegalizeColumnName(string name)
    {
        string legalized = name.Replace("DateTime", "DateTime_");
        legalized = Regex.Replace(legalized, @"(^m:)", "m_");
        legalized = Regex.Replace(legalized, @"(@)", "_atsymbol_");

        // replace the illegal char brought in by Category transform and Generic transform
        legalized = Regex.Replace(legalized, @"([^\w\d_]+)", "_");

        // and cut the length to 450 if it's too long. C# has a 512 char length limit to identifier(so does scope)
        legalized = legalized.Substring(0, Math.Min(450, legalized.Length)); // cap length to 450

        return legalized;
    }

    public void Initialize(string[] args)
    {
        if (initialized)
            return;

        // create and add 
        // Process args, if any
        for (Int32 i = 0; i < args.Length; i++)
        {
            switch (args[i])
            {
                case "-schemaFile":
                    if (i == args.Length - 1)
                        throw new ArgumentException("No value specified", "schemaFile");
                    schemaFile = Path.GetFileName(args[++i]); //only get file name

                    // load up schemafile
                    using (StreamReader sr = new StreamReader(schemaFile))//read header
                    {
                        schemaString = string.Join(",", sr.ReadLine().Trim().Split('\t').Select(t => LegalizeColumnName(t)).ToArray());
                    }
                    break;

                case "-delim":
                    if (i == args.Length - 1)
                        throw new ArgumentException("No value specified", "delim");
                    delim = args[++i];
                    break;
            }
        }

        initialized = true;
    }

    public override Schema GetOutputSchemaAtCompileTime(string[] requestedColumns, string[] args)
    {
        Initialize(args);
        return new Schema(schemaString);
    }

    public override IEnumerable<Row> Extract(StreamReader reader, Row outputRow, string[] args)
    {
        Initialize(args);

        string line;
        int count = 0;

        while ((line = reader.ReadLine()) != null)
        {
            count++;

            if (count == 1 && (line.StartsWith("m:", StringComparison.OrdinalIgnoreCase) || line.StartsWith("Feature_", StringComparison.OrdinalIgnoreCase)))//skip header
            {
                continue;
            }

            string[] tokens = line.Split(delim.ToCharArray());
            if (tokens.Length < outputRow.Count)
            {
                continue;
            }

            try
            {
                for (int i = 0; i < tokens.Length; ++i)
                {
                    outputRow[i].UnsafeSet(tokens[i]);
                }
            }
            catch(Exception)
            {
                continue;
            }

            yield return outputRow;
        }
    }
}
public class HeaderOutputter : Outputter
{
       public static string RevertLegalizeColumnName(string name)
    {
        string legalized = name.Replace("DateTime_", "DateTime");
        legalized = Regex.Replace(legalized, @"(^m_)", "m:");
        legalized = Regex.Replace(legalized, @"(_atsymbol_)", "@");
        return legalized;
    }

       public override void Output(RowSet input, StreamWriter writer, string[] args)
       {
              // output the header
              writer.WriteLine(string.Join("\t", input.Schema.ToString().Split(',').Select(c => RevertLegalizeColumnName(c.Split(':')[0])).ToArray()));
              writer.Flush();
       }
}
#ENDCS
'''


def generate_scope(d, next_d, dataname, headername):
    """Write the sampling script for day ``d`` to ``scopes_2/<YYYYMMDD>.script``.

    The generated script reads ``train_val/<YYYYMMDD>.tsv`` using the header
    file ``train_val/header_<YYYYMMDD>.tsv``, takes a uniform 0.06 sample and
    writes it to ``samples/<YYYYMMDD>.tsv`` with a stream expiry of "30". It
    is followed by a fixed C# code-behind section.

    Args:
        d: ``datetime.date`` the script is generated for; determines the file
            name and every dated path inside the script.
        next_d: ``datetime.date``; only echoed in the progress line.
        dataname: only echoed in the progress line; not used in the script.
        headername: only echoed in the progress line; not used in the script.

    Side effects:
        Prints one progress line, creates the ``scopes_2`` directory if it is
        missing, and (over)writes ``scopes_2/<YYYYMMDD>.script``.
    """
    cdate = d.strftime("%Y%m%d")

    # Progress line: year, month, compact date, the day and the following day
    # in slash form, then the two names passed in by the caller.
    print(cdate[:4], cdate[4:6], cdate, d.strftime("%Y/%m/%d"), next_d.strftime("%Y/%m/%d"), dataname, headername)

    # Only the query half carries the {cdate} placeholder.
    cur_scripts = _SCOPE_QUERY_TEMPLATE.format(cdate=cdate) + _CSHARP_CODE_BEHIND

    # Make sure the target directory exists, then write through a context
    # manager so the handle is closed even if the write fails.
    os.makedirs('scopes_2', exist_ok=True)
    with open('scopes_2/{}.script'.format(cdate), 'w') as f:
        f.write(cur_scripts)

In [7]:
# Walk every day in [begin, end] and emit one script per day. Days before
# split_date are paired with the "first" data/header names, the rest with the
# "second" ones.
d = begin
while not d > end:
    next_date = d + delta
    generate_scope(
        d,
        next_date,
        first_data if d < split_date else second_data,
        first_header if d < split_date else second_header,
    )
    d = next_date

2022 03 20220324 2022/03/24 2022/03/25 2022-03-24_2022-04-06/TA/data.tsv 2022-03-24_2022-04-06/TA/header_TA.tsv
2022 03 20220325 2022/03/25 2022/03/26 2022-03-24_2022-04-06/TA/data.tsv 2022-03-24_2022-04-06/TA/header_TA.tsv
2022 03 20220326 2022/03/26 2022/03/27 2022-03-24_2022-04-06/TA/data.tsv 2022-03-24_2022-04-06/TA/header_TA.tsv
2022 03 20220327 2022/03/27 2022/03/28 2022-03-24_2022-04-06/TA/data.tsv 2022-03-24_2022-04-06/TA/header_TA.tsv
2022 03 20220328 2022/03/28 2022/03/29 2022-03-24_2022-04-06/TA/data.tsv 2022-03-24_2022-04-06/TA/header_TA.tsv
2022 03 20220329 2022/03/29 2022/03/30 2022-03-24_2022-04-06/TA/data.tsv 2022-03-24_2022-04-06/TA/header_TA.tsv
2022 03 20220330 2022/03/30 2022/03/31 2022-03-24_2022-04-06/TA/data.tsv 2022-03-24_2022-04-06/TA/header_TA.tsv
2022 03 20220331 2022/03/31 2022/04/01 2022-03-24_2022-04-06/TA/data.tsv 2022-03-24_2022-04-06/TA/header_TA.tsv
2022 04 20220401 2022/04/01 2022/04/02 2022-03-24_2022-04-06/TA/data.tsv 2022-03-24_2022-04-06/TA/header