In [None]:
import os
import csv
from collections import Counter
import numpy as np
import matplotlib.pyplot as plt
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_extraction.text import CountVectorizer
import random
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC
import datetime
import pytz
import zipfile

In [None]:
def csv_to_list(path):
  """Read the CSV file at `path` and return its rows.

  Returns:
    A list with one entry per CSV record; each entry is the list of string
    fields produced by `csv.reader`.
  """
  # newline='' leaves line-ending handling to the csv module.
  with open(path, newline='') as csv_file:
    return list(csv.reader(csv_file))

def create_new_csv(path, new_rows):
  """Write `new_rows` to a CSV file at `path`, overwriting any existing file.

  Args:
    path: destination file path.
    new_rows: iterable of rows, each row an iterable of field values.
  """
  # newline='' is required by the csv module: the writer emits its own
  # '\r\n' terminators, and on platforms that translate '\n' on write
  # (Windows) omitting it yields '\r\r\n', i.e. a blank line after every row.
  with open(path, 'w', newline='') as f:
    csv.writer(f).writerows(new_rows)

In [None]:
def get_train_for_user(all_users_vectorized, user, num_segments,
                       num_users=40, segments_per_user=150, train_segments=50):
  """Build the training set used for the classifier of a single user.

  The rows of `all_users_vectorized` are expected to be laid out user by
  user, `segments_per_user` consecutive rows for each user. The result is:
    * the first `train_segments` rows of `user`, labelled `user`;
    * for every other user, one randomly positioned run of `num_segments`
      consecutive rows taken from that user's rows after its first
      `train_segments` rows, labelled with that user's index.

  Args:
    all_users_vectorized: 2-D array-like, one feature row per segment.
    user: index of the user the training set is built for.
    num_segments: number of consecutive rows sampled from each *other* user.
    num_users: total number of users in `all_users_vectorized`.
    segments_per_user: number of rows belonging to each user.
    train_segments: number of leading rows per user reserved for training.

  Returns:
    (train, labels): `train` is the stacked feature rows and `labels` is a
    numpy array holding the user index of every row in `train`.

  Raises:
    ValueError: (from `random.randint`) if `num_segments` exceeds the number
      of rows available per other user (`segments_per_user - train_segments`).
  """
  own_start = segments_per_user * user
  own_rows = all_users_vectorized[own_start : own_start + train_segments]
  parts = [own_rows]
  labels = [user] * len(own_rows)
  for other_user in range(num_users):
    if other_user == user:
      continue
    first_start = segments_per_user * other_user + train_segments
    # randint is inclusive on both ends, so the last valid start is exactly
    # `num_segments` rows before the end of this user's block; this lets the
    # user's final row be sampled too.
    last_start = segments_per_user * (other_user + 1) - num_segments
    line_index = random.randint(first_start, last_start)
    parts.append(all_users_vectorized[line_index : line_index + num_segments])
    labels += [other_user] * num_segments
  # Stack once at the end instead of re-copying the growing array per user.
  return np.concatenate(parts, axis=0), np.array(labels)

In [None]:
def get_segments_from_user(word_list, user, num_segments, start_index, last_index):
  """Return `num_segments` consecutive segments of `user` from a random position.

  A segment is 100 consecutive entries of `word_list[user]` joined with single
  spaces. The index of the first segment is drawn uniformly (both ends
  inclusive) from `start_index` to `last_index - num_segments`.

  Returns:
    A list of `num_segments` strings, one per segment.
  """
  first_segment = random.randint(start_index, last_index - num_segments)
  return [
    ' '.join(word_list[user][100 * seg : 100 * (seg + 1)])
    for seg in range(first_segment, first_segment + num_segments)
  ]

In [None]:
def get_unique_keys(word_list):
  """Flatten the per-user word lists into one list of segment strings.

  For each of the keys "User0" .. "User39" (in that order) the user's words
  are cut into 150 segments of 100 entries, and each segment is joined with
  single spaces into one string.

  Returns:
    A list of 40 * 150 strings: all segments of User0, then User1, and so on.
  """
  # TODO(review): the corpus holds the segments of all users together --
  # confirm that this is the intended input for the vectorizer.
  return [
    ' '.join(word_list["User" + str(user_index)][segment * 100:(segment + 1) * 100])
    for user_index in range(40)
    for segment in range(0, 150)
  ]

In [None]:
def get_vectorized(corpus, ngram_range=(1, 1)):
  """Convert segment strings into a dense matrix of token / n-gram counts.

  Args:
    corpus: iterable of strings, one string per segment.
    ngram_range: `(min_n, max_n)` forwarded to `CountVectorizer`. The default
      `(1, 1)` counts single tokens; `(2, 2)` and `(3, 3)` give the bigram and
      trigram counts that get_vectorized2 / get_vectorized3 compute.

  Returns:
    The count matrix as a dense array (`fit_transform(...).toarray()`), one
    row per entry of `corpus`.
  """
  # Tokens may contain '-' and '.' in addition to word characters, and the
  # pattern needs at least two characters to match. No stop-word list is used.
  pattern = "(?u)\\b[\\w-]+[\\w.]+\\b"
  vectorizer = CountVectorizer(token_pattern=pattern, ngram_range=ngram_range)
  counts = vectorizer.fit_transform(corpus)
  return counts.toarray()

In [None]:
def get_vectorized2(corpus):
  """Convert segment strings into a dense matrix of bigram counts.

  Same tokenisation as the unigram vectorizer, but only sequences of exactly
  two consecutive tokens are counted (`ngram_range=(2, 2)`).

  Returns:
    The count matrix as a dense array, one row per entry of `corpus`.
  """
  bigram_counter = CountVectorizer(
    token_pattern="(?u)\\b[\\w-]+[\\w.]+\\b",
    ngram_range=(2, 2),
  )
  return bigram_counter.fit_transform(corpus).toarray()

In [None]:
def get_vectorized3(corpus):
  """Convert segment strings into a dense matrix of trigram counts.

  Same tokenisation as the unigram vectorizer, but only sequences of exactly
  three consecutive tokens are counted (`ngram_range=(3, 3)`).

  Returns:
    The count matrix as a dense array, one row per entry of `corpus`.
  """
  trigram_counter = CountVectorizer(
    token_pattern="(?u)\\b[\\w-]+[\\w.]+\\b",
    ngram_range=(3, 3),
  )
  return trigram_counter.fit_transform(corpus).toarray()

In [None]:
def get_word_list_from_path(path):
  """Load every regular file directly inside `path` as a list of its lines.

  Args:
    path: directory containing one text file per user.

  Returns:
    A dict mapping each file name to the list of that file's lines, in file
    order, with the trailing newline of each line removed.
  """
  word_list = {}
  for filename in os.listdir(path):
    full_path = os.path.join(path, filename)
    # Skip Jupyter's checkpoint folder and any other entry that is not a
    # regular file: open() on a directory would raise and abort the load.
    if filename == '.ipynb_checkpoints' or not os.path.isfile(full_path):
      continue
    with open(full_path, 'r') as f:
      word_list[filename] = [line.rstrip('\n') for line in f]
  return word_list

In [None]:
def unzip(path_to_zip_file):
  """Extract all members of the zip archive into the current directory ("./")."""
  archive = zipfile.ZipFile(path_to_zip_file, 'r')
  # The context manager closes the archive once extraction is done.
  with archive:
    archive.extractall("./")
  
#unzip("/content/FraudedRawData.zip")

In [None]:
def calc_accuracy(original_rows, user, result):
  """Return the fraction of positions where `result` matches the known labels.

  The known labels are row `user + 1` of `original_rows`. Only positions from
  index 51 to the end of that row are compared, element by element, against
  the same positions of `result`.

  Raises:
    IndexError: if `result` is shorter than the label row.
    ZeroDivisionError: if the label row has no entry at index 51 or beyond.
  """
  expected = original_rows[user + 1]
  positions = range(51, len(expected))
  hits = sum(1 for pos in positions if expected[pos] == result[pos])
  return hits / len(positions)

In [None]:
def get_new_try_name():
  """Return a results-file path stamped with the current Asia/Jerusalem time.

  The path has the form '/content/try_<YYYY-MM-DD_HH:MM:SS>.csv'.
  """
  now = datetime.datetime.now(tz=pytz.timezone("Asia/Jerusalem"))
  stamp = now.strftime("%Y-%m-%d_%H:%M:%S")
  return '/content/try_' + stamp + ".csv"

In [None]:
def get_first_lines_from_orig(original_rows):
  """Return a new list holding the first 11 rows of `original_rows`.

  The row objects themselves are shared with `original_rows`, not copied.

  Raises:
    IndexError: if `original_rows` has fewer than 11 rows.
  """
  # Explicit indexing (rather than slicing) so short input still raises.
  return [original_rows[row_num] for row_num in range(11)]

In [None]:
# Driver cell: train one classifier per user, predict that user's segments
# 50..149, and write the predictions for users 10..39 to a new CSV file.

# NOTE(review): this file name contains invisible Unicode characters right
# after '/content/' (they appear to be U+200F right-to-left marks) -- confirm
# they match the real file name before retyping or "cleaning" this path.
orig_results_path = '/content/‏‏challengeToFill_try.csv'
original_rows = csv_to_list(orig_results_path)

new_results_path = get_new_try_name()

# Start the output from the first 11 rows of the original file; rows for
# users 10 and above are appended in the loop below.
new_rows = get_first_lines_from_orig(original_rows)

path = '/content/FraudedRawData'
word_list = get_word_list_from_path(path)
corpus = get_unique_keys(word_list)
all_users_vectorized = get_vectorized(corpus)

# Number of segments sampled from each *other* user for the training set.
num_segments = 10

for user in range(40):
  print("Evaluating for User{}".format(user))
  # The training set is sampled with `random`; no seed is set in the visible
  # code, so results can differ between runs.
  train, labels = get_train_for_user(all_users_vectorized, user, num_segments)
  clf = OneVsRestClassifier(SVC()).fit(train, labels)
  # Predict the rows of this user that come after its first 50 segments.
  result = clf.predict(all_users_vectorized[user * 150 + 50: (user + 1) * 150])
  # 0 = segment attributed to this user, 1 = attributed to any other user.
  result = [0 if item == user else 1 for item in result]
  result = [str(item) for item in result]
  # The first 50 segments are always written as '0'.
  result = [ '0' for item in range(50)] + result
  # Column 0 of every output row is the user name.
  result = ["User" + str(user)] + result
  if user >= 10:
    new_rows.append(result)
  else:
    # Users 0..9 are only scored against the rows of the original file;
    # their rows in the output are the ones copied from it above.
    acc = calc_accuracy(original_rows, user, result)
    print("The accuracy for User{} is {}".format(user, acc))

create_new_csv(new_results_path, new_rows)

print("Full results are in:")
print(new_results_path)

word_list:
['cpp', 'sh', 'xrdb', 'mkpts', 'stty', 'cat', 'ksh', 'sed', 'cat', 'ksh', 'sed', 'whoami', 'hostname', 'whoami', 'hostname', 'env', 'ksh', 'ksh', 'userenv', 'wait4wm', 'xhost', 'xsetroot', 'reaper', 'cat', 'stty', 'cat', 'ksh', 'sed', 'cat', 'ksh', 'sed', 'whoami', 'hostname', 'whoami', 'hostname', 'whoami', 'hostname', 'ksh', 'whoami', 'hostname', 'whoami', 'hostname', 'ksh', 'whoami', 'hostname', 'popper', 'whoami', 'hostname', 'netstat', 'netscape', 'netscape', 'popper', 'popper', 'popper', 'popper', 'netscape', 'netscape', 'netscape', 'netscape', 'popper', 'netscape', 'popper', 'ksh', 'xterm', 'whoami', 'hostname', 'ksh', 'whoami', 'hostname', 'whoami', 'hostname', 'ksh', 'whoami', 'hostname', 'whoami', 'hostname', 'ksh', 'whoami', 'hostname', 'ksh', 'xterm', 'ls', 'ls', 'popper', 'popper', 'ls', 'ls', 'chmod', 'ls', 'ls', 'ls', 'touch', 'ls', 'rm', 'touch', 'ls', 'rlogin', 'rlogin', 'ksh', 'xterm', 'whoami', 'hostname', 'ksh', 'whoami', 'hostname', 'popper', 'netscape',