# imports

In [1]:
import pandas as pd
import numpy as np
from sklearn import svm
import sys
import re
from copy import deepcopy
from datetime import datetime
import pickle
import glob
from sklearn.utils import shuffle
import os



# helper functions

In [2]:
def predict(svc_obj, df_in_test, df_target_test):
    """Run the classifier on the test set and summarise the outcome as text.

    Parameters
    ----------
    svc_obj
        Fitted classifier exposing ``predict(X)``; the value it returns must
        support ``.tolist()``.
    df_in_test
        Feature table, passed unchanged to ``svc_obj.predict``.
    df_target_test
        True labels exposing ``.values``; 1 marks a hotspot, 0 a non-hotspot.

    Returns
    -------
    str
        Multi-line report with the hit / false-positive / false-negative
        counts and class totals. The "hotspot hit rate" line is emitted only
        when the test set contains hotspots, and the "false alarm rate" line
        only when it contains non-hotspots, so neither rate is ever computed
        with a zero denominator.

    Raises
    ------
    SystemExit
        With a non-zero status if any (true, predicted) label pair contains a
        value other than 0 or 1.
    """
    predicted_labels = svc_obj.predict(df_in_test).tolist()
    true_labels = df_target_test.values.tolist()

    true_hotspot_count = np.sum(true_labels)
    true_non_hotspot_count = len(true_labels) - true_hotspot_count
    hotspot_hit = 0
    non_hotspot_hit = 0
    extra_fp = 0
    extra_fn = 0

    for actual, predicted in zip(true_labels, predicted_labels):
        # accuracy - hotspot being called as a hotspot
        if actual == 1 and predicted == 1:
            hotspot_hit += 1
        # false positives - non-hotspot pattern classified as a hotspot
        elif actual == 0 and predicted == 1:
            extra_fp += 1
        # false negatives - a hotspot pattern classified as a non-hotspot
        elif actual == 1 and predicted == 0:
            extra_fn += 1
        # accuracy - a non-hotspot pattern classified as a non-hotspot
        elif actual == 0 and predicted == 0:
            non_hotspot_hit += 1
        else:
            # Passing a message makes the process exit with a failure status
            # instead of reporting success on an error.
            sys.exit('****** ERROR: unknown result values ******')

    report_lines = [
        "****** Results ******\n",
        "hotspot hit = {}\n".format(hotspot_hit),
        "non-hotspot hit = {}\n".format(non_hotspot_hit),
        "false positives = {}\n".format(extra_fp),
        "false negatives = {}\n".format(extra_fn),
        "Total hotspots in test set = {}\n".format(true_hotspot_count),
        "Total non-hotspots in test set = {}\n".format(true_non_hotspot_count),
    ]
    if true_hotspot_count != 0:
        hit_rate = np.round((float(hotspot_hit) / true_hotspot_count) * 100, 2)
        report_lines.append("hotspot hit rate = {}%\n".format(hit_rate))
    # Same guard as above: the rate is undefined without non-hotspot samples.
    if true_non_hotspot_count != 0:
        false_alarm_rate = np.round((float(extra_fp) / true_non_hotspot_count) * 100, 2)
        report_lines.append("false alarm rate = {}%\n".format(false_alarm_rate))

    return "".join(report_lines)
    
    
    
    
def run_analysis(complete_training_df, complete_testing_df):
    """Optionally train the shared classifier, then score it on the test set.

    Reads these module-level names set by the driver code: ``test_only``,
    ``clf``, ``retrain_and_save_models``, ``model_path`` and
    ``prog_start_time``.

    Parameters
    ----------
    complete_training_df
        Training table containing a "Litho_result" label column and carrying
        a ``name`` attribute used in log messages. Only read when
        ``test_only`` is false.
    complete_testing_df
        Test table containing a "Litho_result" label column. It is not
        modified; the label column is read and a label-free copy is built.

    Side effects
    ------------
    Prints progress and the result report. When training, fits ``clf`` in
    place and, if ``retrain_and_save_models`` is set, pickles ``[clf]`` to
    "trained_model.pkl" under ``model_path`` (exiting via ``sys.exit`` if the
    file cannot be written).
    """
    if not test_only:
        train_df_name=complete_training_df.name
        #shuffle the input train df
        complete_training_df = shuffle(complete_training_df)
        if complete_training_df.isnull().values.any():
            print "**** Error: The training set has some Nan values **** - ", train_df_name 

        df_target_train = complete_training_df["Litho_result"]
        df_in_train = complete_training_df.drop("Litho_result", axis=1)

    # Read the label column and drop it from a copy rather than pop() it, so
    # the caller's test dataframe keeps its "Litho_result" column.
    df_target_test = complete_testing_df["Litho_result"]
    df_in_test = complete_testing_df.drop("Litho_result", axis=1)

    if not test_only:
        # fit the model
        print "Training started ---", datetime.now(), "--Training dataset size - ", df_in_train.shape, "-", train_df_name
        train_start_time = datetime.now()
        clf.fit(df_in_train, df_target_train)
        train_end_time = datetime.now()
        print "----Training Completed----", datetime.now(), "-", train_df_name

        #pickle the necessary items    
        if retrain_and_save_models:
            try:
                # os.path.join works whether or not model_path ends with a separator
                with open(os.path.join(model_path, "trained_model.pkl"), 'wb') as f:  
                    pickle.dump([clf], f, protocol=-1)
                    print "---- Trained model saved at {} ----".format(model_path), datetime.now(), "-", train_df_name
            except Exception as e:
                sys.exit("*** ERROR: Unable to save pickle file,  with exception - {}".format(e))
            
            
    #Test set prediction           
    print "Test set prediction started --- test dataset size - ", df_in_test.shape
    testset_result_print_str = predict(clf, df_in_test, df_target_test)
    print "----Predict test set completed----", datetime.now()    
    print testset_result_print_str
    if not test_only:
        print 'Total training time: {}'.format(train_end_time - train_start_time), "-", train_df_name
    print 'Total run time: {}'.format(datetime.now() - prog_start_time)
            
            



Run main code
===


In [3]:
# True  -> train locally and save the models at model_path.
# False -> use the pre-trained models found at model_path.
retrain_and_save_models = False

# Use 32nm models for testing array_benchmark1, 28nm models for the rest.
model_path = "./trained_models/28nm/"

# All training datasets from the same node are combined and trained once;
# the common trained model is then used to test each testing dataset
# individually.
train_df_paths = [
    "./train_dataset/MX_Benchmark2_clip_named_.csv",
    "./train_dataset/MX_Benchmark3_clip_named_.csv",
    "./train_dataset/MX_Benchmark4_clip_named_.csv",
    "./train_dataset/MX_Benchmark5_clip_named_.csv",
]

test_df_path = "./test_dataset/array_benchmark5_named_.csv"
            
#------------------------------ No user modifications needed below this line ------------------------

if retrain_and_save_models:
    if not os.path.exists(model_path):
        os.makedirs(model_path)
    test_only=False
else:
    test_only=True

prog_start_time = datetime.now()
global clf

if not test_only:
    # create training dataframes - Mx benchmarks
    train_df = pd.DataFrame()
    for path in train_df_paths:
        temp_df = pd.read_csv(path, header = 0, index_col = 0)
        train_df = train_df.append(temp_df, ignore_index=False)[temp_df.columns]

# create test dataframes - Array benchmarks
print "-- Testing Dataset used --", test_df_path
test_df = pd.read_csv(test_df_path, header = 0, index_col = 0)

if not test_only:
    complete_train_hotspot_df = train_df[train_df['Litho_result']==1]
    complete_train_non_hotspot_df = train_df[train_df['Litho_result']==0]
    complete_train_df = pd.concat([complete_train_hotspot_df, complete_train_non_hotspot_df], ignore_index=False)

complete_testing_df = test_df
print "complete_testing_df size - ", complete_testing_df.shape

if not test_only:
    #compute weights
    hotspot_count = len(complete_train_hotspot_df)
    non_hotspot_count = len(complete_train_non_hotspot_df)
    hotspot_weight = len(complete_train_df)/(2.0*hotspot_count) # n_samples/(n_clases*class_sample_count)
    non_hotspot_weight = len(complete_train_df)/(2.0*non_hotspot_count)
    norm_hotspot_weight = hotspot_weight/(hotspot_weight+non_hotspot_weight)
    norm_non_hotspot_weight = non_hotspot_weight/(hotspot_weight+non_hotspot_weight)
    
    C_val = 6 # obtained through cross-validation
    gamma_val=0.003 #obtained through cross validation

    clf = svm.SVC(C=C_val, degree=3, gamma=gamma_val, kernel='rbf', verbose=True,cache_size=40000, class_weight={0:norm_non_hotspot_weight+0.01, 1:norm_hotspot_weight+0.04}) #weight bias values obtained through cross-validation
    complete_train_df.name="train_df"
else: # use a pre-trained model
    try:
        with open(model_path+"trained_model.pkl", 'rb') as f:    
            clf = pickle.load(f)[0]
    except Exception as e:
        sys.exit("*** ERROR: Unable to read a pkl file, with exception - {}".format(e)) 

if test_only:
    complete_train_df = pd.DataFrame()

run_analysis(complete_train_df, complete_testing_df)



-- Testing Dataset used -- ./test_dataset/array_benchmark5_named_.csv
complete_testing_df size -  (2152, 1601)
Test set prediction started --- test dataset size -  (2152, 1600)
----Predict test set completed---- 2019-04-25 18:16:11.527186
****** Results ******
hotspot hit = 41
non-hotspot hit = 2107
false positives = 4
false negatives = 0
Total hotspots in test set = 41
Total non-hotspots in test set = 2111
hotspot hit rate = 100.0%
false alarm rate = 0.19%

Total run time: 0:00:09.982941
