# Data Science Task

## Valerie task

In [1]:
import pandas as pd
import numpy as np
import random

In [26]:

def col_discretize(nrows):
    
    ages = np.random.randint(1,40, nrows)
    
    def age_filter(num):
        if num < 18:
            return "Under 18"
        elif num >= 18 and num < 26:
            return "18-25"
        elif num >= 26 and num < 35:
            return "26-34"
        else:
            return "35 and Above"
    
    new_ages = [age_filter(num) for num in ages]
    
    return pd.DataFrame({'age':ages}), pd.DataFrame({'age':new_ages})

def col_onehot(nrows):
    
    from random import choices 
  
    colors = ['brown', 'green', 'blue']
    lst = choices(colors,k=nrows)
    one_hot = pd.get_dummies(lst)
    
    return pd.DataFrame({'color':lst}), one_hot
    

def process_date(nrows):
    
    '''
    2019-03-22 00:00:00 --> 2019, 03, 22
    '''
    
    # if we want more rows, that would be a problem.. 
    
    from datetime import date, timedelta
    sdate = date(2019,3,1)   # start date
    edate = date(2019,3,1+nrows)   # end date
    dates = pd.date_range(sdate,edate-timedelta(days=1),freq='d').tolist()
    random.shuffle(dates)
    

    d_old = {'dates': dates}
    df = pd.DataFrame(data=d_old)
    df_temp = pd.to_datetime(df['dates'],format='%Y-%m-%d')
    
    d_new = {'year': df_temp.dt.year, 'month': df_temp.dt.month, 'day':df_temp.dt.day}
    df_new = pd.DataFrame(data=d_new)
         
    return df.reset_index(drop=True) , df_new.reset_index(drop=True) 
    #return df.sa(n=nrows, random_state=1), df_new.choices(n=nrows, random_state=1)


def round_number(nrows):
    
    '''
    round generated list to a randomly generated number of digits
    '''
    
    round_digits = np.random.randint(4)
    
    orig = np.random.rand(nrows)*10
    mod = orig.round(round_digits)
    
    return pd.DataFrame({'height':orig}), pd.DataFrame({'height':mod})

def col_drop(df):
    
    '''
    given df, randomly drop between 1-4 columns
    '''
    
    from random import sample
    num_cols_to_drop = np.random.randint(1,4)
    cols_to_drop = sample(df.columns.tolist(), num_cols_to_drop)

    return df.drop(columns=cols_to_drop)


def create_datasets(rand_seed):
    
    random.seed(rand_seed)
    np.random.seed(rand_seed) 
    
    nrows = 10
    
    fns_to_apply = ["col_discretize", "col_onehot", "process_date", "round_number"]
    
    orig_df = pd.DataFrame()
    new_df = pd.DataFrame()
    
    
    for fn in fns_to_apply:
        orig, new = globals()[fn](nrows)
        
        orig_df = pd.concat([orig_df, orig], axis=1)
        new_df = pd.concat([new_df, new], axis=1)
                
    
    new_df = col_drop(new_df)
    
    return orig_df, new_df

def evaluate_correctness(target_df, input_df):
    score = 0
    for col in target_df.columns:
        if col not in input_df.columns:
            score -=1 
        else:
            if np.all(target_df[col] != input_df[col]):
                score-=1
    return score

In [27]:
seed = 0
orig_df, new_df = create_datasets(seed)
evaluate_correctness(orig_df, new_df)
from tabulate import tabulate

#get the text form of each dataframe
txt = tabulate(orig_df.head(), headers = 'keys', tablefmt = 'psql')
txt_new = tabulate(new_df.head(), headers = 'keys', tablefmt = 'psql')
print(txt)
print(txt_new)


+----+-------+---------+---------------------+----------+
|    |   age | color   | dates               |   height |
|----+-------+---------+---------------------+----------|
|  0 |     1 | blue    | 2019-03-06 00:00:00 |  2.72656 |
|  1 |     4 | blue    | 2019-03-05 00:00:00 |  4.77665 |
|  2 |     4 | green   | 2019-03-10 00:00:00 |  8.12169 |
|  3 |    10 | brown   | 2019-03-07 00:00:00 |  4.79977 |
|  4 |    20 | green   | 2019-03-01 00:00:00 |  3.92785 |
+----+-------+---------+---------------------+----------+
+----+----------+--------+---------+---------+---------+-------+----------+
|    | age      |   blue |   brown |   green |   month |   day |   height |
|----+----------+--------+---------+---------+---------+-------+----------|
|  0 | Under 18 |      1 |       0 |       0 |       3 |     6 |        3 |
|  1 | Under 18 |      1 |       0 |       0 |       3 |     5 |        5 |
|  2 | Under 18 |      0 |       0 |       1 |       3 |    10 |        8 |
|  3 | Under 18 |     

In [28]:
import pandas as pd
from io import StringIO

# Original dataset
data = """
age,color,dates,height
1,blue,2019-03-06,2.72656
4,blue,2019-03-05,4.77665
4,green,2019-03-10,8.12169
10,brown,2019-03-07,4.79977
20,green,2019-03-01,3.92785
"""

# Read the dataset into a DataFrame
orig_df = pd.read_csv(StringIO(data))

df = orig_df.copy()
# Process the dataset to match the desired format
# 1. Convert 'age' to categorical data
df['age'] = pd.cut(df['age'], bins=[0, 18, 25, 100], labels=["Under 18", "18-25", "Over 25"], right=False)

# 2. Convert 'color' to one-hot encoding
color_dummies = pd.get_dummies(df['color'])
df = pd.concat([df, color_dummies], axis=1)

# 3. Extract 'month' and 'day' from 'dates'
df['dates'] = pd.to_datetime(df['dates'])
df['month'] = df['dates'].dt.month
df['day'] = df['dates'].dt.day

# 4. Round 'height' to nearest integer
df['height'] = df['height'].round().astype(int)

# 5. Drop the original 'color' and 'dates' columns
df.drop(['color', 'dates'], axis=1, inplace=True)

# Rearrange columns to match the desired format
df = df[['age', 'blue', 'brown', 'green', 'month', 'day', 'height']]

df
# convert to string
df_str = df.to_csv(index=False)
print(df_str)



age,blue,brown,green,month,day,height
Under 18,1,0,0,3,6,3
Under 18,1,0,0,3,5,5
Under 18,0,0,1,3,10,8
Under 18,0,1,0,3,7,5
18-25,0,0,1,3,1,4



In [33]:
import pandas as pd
from io import StringIO

# Original dataset
data = """
age,color,dates,height
1,blue,2019-03-06,2.72656
4,blue,2019-03-05,4.77665
4,green,2019-03-10,8.12169
10,brown,2019-03-07,4.79977
20,green,2019-03-01,3.92785
"""

# Read the dataset into a DataFrame
df = pd.read_csv(StringIO(data))
def transform_df(df):
    # Process the dataset to match the desired format
    # 1. Convert 'age' to categorical data
    df['age'] = pd.cut(df['age'], bins=[0, 18, 25, 100], labels=["Under 18", "18-25", "Over 25"], right=False)

    # 2. Convert 'color' to one-hot encoding
    color_dummies = pd.get_dummies(df['color'])
    df = pd.concat([df, color_dummies], axis=1)

    # 3. Extract 'month' and 'day' from 'dates'
    df['dates'] = pd.to_datetime(df['dates'])
    df['month'] = df['dates'].dt.month
    df['day'] = df['dates'].dt.day

    # 4. Round 'height' to nearest integer
    df['height'] = df['height'].round().astype(int)

    # 5. Drop the original 'color' and 'dates' columns
    df.drop(['color', 'dates'], axis=1, inplace=True)

    # Rearrange columns to match the desired format
    df = df[['age', 'blue', 'brown', 'green', 'month', 'day', 'height']]
    return df


In [35]:
data = """
age,color,dates,height
1,blue,2019-03-06,2.72656
4,blue,2019-03-05,4.77665
4,green,2019-03-10,8.12169
10,brown,2019-03-07,4.79977
20,green,2019-03-01,3.92785
"""
df = pd.read_csv(StringIO(data))
new_data_testing = """
age,blue,brown,green,month,day,height
Under 18,1,0,0,3,6,3
Under 18,1,0,0,3,5,5
Under 18,0,0,1,3,10,8
Under 18,0,1,0,3,7,5
18-25,0,0,1,3,1,4
"""
def evaluate_correctness(target_df, input_df):
    score = 0
    for col in target_df.columns:
        if col not in input_df.columns:
            score -=1 
        else:
            if np.all(target_df[col] != input_df[col]):
                score-=1
    return score
new_data_df_testing = pd.read_csv(StringIO(new_data_testing))
assert evaluate_correctness(new_data_df_testing, transform_df(df)) == 0

AssertionError: 

## Another Problem

In [1]:
import numpy as np
import pandas as pd
from tabulate import tabulate

# Define the number of rows
n_rows = 6

# Create the DataFrame with random data
np.random.seed(0)  # Set seed for reproducibility
df_random = pd.DataFrame({
    'col1': np.random.randint(1, 11, size=n_rows),
    'col2': np.random.randint(1, 11, size=n_rows),
    'col3': np.random.uniform(0, 10, size=n_rows),
    'col4': np.random.randint(1, 11, size=n_rows),
    'col5': np.random.randint(1, 11, size=n_rows)
})
# remove index column
df_random.reset_index(drop=True, inplace=True)

df_str = df_random.to_csv(index=False)
print(df_str)

col1,col2,col3,col4,col5
6,4,0.5671297731744318,10,4
1,6,2.726562945801132,9,6
4,3,4.776651173213499,10,1
4,5,8.121687287754932,5,3
8,8,4.799771723750573,4,4
10,7,3.9278479610082973,1,9



In [2]:
# target 
# Transform the DataFrame as requested

df_transformed = df_random.copy()

# col1 as the multiplication of col1 and col4
df_transformed['col1'] = df_transformed['col1'] * df_transformed['col4']

# col2 as col3 truncated to the nearest integer
df_transformed['col2'] = df_transformed['col3'].astype(int)

# col4 multiplied by 100
df_transformed['col4'] = df_transformed['col4'] * 100

# Remove col5
df_transformed.drop('col5', axis=1, inplace=True)

df_transformed

df_str = df_transformed.to_csv(index=False)
print(df_str)

col1,col2,col3,col4
60,0,0.5671297731744318,1000
9,2,2.726562945801132,900
40,4,4.776651173213499,1000
20,8,8.121687287754932,500
32,4,4.799771723750573,400
10,3,3.9278479610082973,100



In [3]:
from tabulate import tabulate

#get the text form of each dataframe
txt = tabulate(df_random, headers = 'keys', tablefmt = 'psql')
txt_new = tabulate(df_transformed, headers = 'keys', tablefmt = 'psql')
print(txt)
print(txt_new)

+----+--------+--------+---------+--------+--------+
|    |   col1 |   col2 |    col3 |   col4 |   col5 |
|----+--------+--------+---------+--------+--------|
|  0 |      6 |      4 | 0.56713 |     10 |      4 |
|  1 |      1 |      6 | 2.72656 |      9 |      6 |
|  2 |      4 |      3 | 4.77665 |     10 |      1 |
|  3 |      4 |      5 | 8.12169 |      5 |      3 |
|  4 |      8 |      8 | 4.79977 |      4 |      4 |
|  5 |     10 |      7 | 3.92785 |      1 |      9 |
+----+--------+--------+---------+--------+--------+
+----+--------+--------+---------+--------+
|    |   col1 |   col2 |    col3 |   col4 |
|----+--------+--------+---------+--------|
|  0 |     60 |      0 | 0.56713 |   1000 |
|  1 |      9 |      2 | 2.72656 |    900 |
|  2 |     40 |      4 | 4.77665 |   1000 |
|  3 |     20 |      8 | 8.12169 |    500 |
|  4 |     32 |      4 | 4.79977 |    400 |
|  5 |     10 |      3 | 3.92785 |    100 |
+----+--------+--------+---------+--------+


In [4]:
import pandas as pd
from io import StringIO

# Original dataset
data = """
col1,col2,col3,col4,col5
6,4,0.5671297731744318,10,4
1,6,2.726562945801132,9,6
4,3,4.776651173213499,10,1
4,5,8.121687287754932,5,3
8,8,4.799771723750573,4,4
10,7,3.9278479610082973,1,9
"""

# Read the dataset into a DataFrame
df = pd.read_csv(StringIO(data))
def transform_df(df):
    df_transformed = df.copy()

    # col1 as the multiplication of col1 and col4
    df_transformed['col1'] = df['col1'] * df['col4']

    # col2 as col3 truncated to the nearest integer
    df_transformed['col2'] = df['col3'].astype(int)

    # col4 multiplied by 100
    df_transformed['col4'] = df['col4'] * 100

    # Remove col5
    df_transformed.drop('col5', axis=1, inplace=True)
    return df_transformed

In [5]:
data = """
col1,col2,col3,col4,col5
6,4,0.5671297731744318,10,4
1,6,2.726562945801132,9,6
4,3,4.776651173213499,10,1
4,5,8.121687287754932,5,3
8,8,4.799771723750573,4,4
10,7,3.9278479610082973,1,9
"""
df = pd.read_csv(StringIO(data))
new_data_testing = """
col1,col2,col3,col4
60,0,0.5671297731744318,1000
9,2,2.726562945801132,900
40,4,4.776651173213499,1000
20,8,8.121687287754932,500
32,4,4.799771723750573,400
10,3,3.9278479610082973,100
"""
def evaluate_correctness(target_df, input_df):
    score = 0
    for col in target_df.columns:
        if col not in input_df.columns:
            score -=1 
        else:
            if np.all(target_df[col] != input_df[col]):
                score-=1
    for col in input_df.columns:
        if col not in target_df.columns:
            score -=1
    return score
new_data_df_testing = pd.read_csv(StringIO(new_data_testing))
assert evaluate_correctness(new_data_df_testing, transform_df(df)) == 0

### Another problem

In [12]:
import numpy as np
import pandas as pd
from tabulate import tabulate

# Define the number of rows
n_rows = 5

# Create the DataFrame with random data
np.random.seed(1)  # Set seed for reproducibility
df_random = pd.DataFrame({
    'col1': np.random.randint(1, 11, size=n_rows),
    'col2': np.random.randint(1, 11, size=n_rows),
    'col3': np.random.uniform(0, 10, size=n_rows),
    'col4': np.random.randint(1, 11, size=n_rows),
    'col5': np.random.randint(1, 11, size=n_rows)
})
# remove index column
df_random.index.name = None

df_str = df_random.to_csv(index=False)
print(df_str)

col1,col2,col3,col4,col5
6,1,5.3881673400335695,3,2
9,2,4.191945144032948,5,8
10,8,6.852195003967595,8,1
6,7,2.0445224973151745,8,7
1,10,8.781174363909454,10,10



In [14]:
def transform_df(df):
    df_transformed_requested = df.copy()
    df_transformed_requested['col1'] = df['col1'].cumsum()

    # col2 + 1
    df_transformed_requested['col2'] = df['col2'] + 1

    # col3 plus col4
    df_transformed_requested['col3'] = df['col3'] + df['col4']

    # Remove col4 and col5
    df_transformed_requested.drop(['col4', 'col5'], axis=1, inplace=True)

    # add two extra rows that are all zeros to the df
    df_transformed_requested = df_transformed_requested.append(pd.DataFrame([[0,0,0]], columns=df_transformed_requested.columns))
    df_transformed_requested = df_transformed_requested.append(pd.DataFrame([[0,0,0]], columns=df_transformed_requested.columns))
    return df_transformed_requested
transformed_df = transform_df(df_random)
df_str = transformed_df.to_csv(index=False)
print(df_str)

col1,col2,col3
6,2,8.388167340033569
15,3,9.191945144032948
25,9,14.852195003967594
31,8,10.044522497315175
32,11,18.781174363909454
0,0,0.0
0,0,0.0



  df_transformed_requested = df_transformed_requested.append(pd.DataFrame([[0,0,0]], columns=df_transformed_requested.columns))
  df_transformed_requested = df_transformed_requested.append(pd.DataFrame([[0,0,0]], columns=df_transformed_requested.columns))


In [15]:
from tabulate import tabulate

#get the text form of each dataframe
txt = tabulate(df_random, headers = 'keys', tablefmt = 'psql')
txt_new = tabulate(transformed_df, headers = 'keys', tablefmt = 'psql')
print(txt)
print(txt_new)

+----+--------+--------+---------+--------+--------+
|    |   col1 |   col2 |    col3 |   col4 |   col5 |
|----+--------+--------+---------+--------+--------|
|  0 |      6 |      1 | 5.38817 |      3 |      2 |
|  1 |      9 |      2 | 4.19195 |      5 |      8 |
|  2 |     10 |      8 | 6.8522  |      8 |      1 |
|  3 |      6 |      7 | 2.04452 |      8 |      7 |
|  4 |      1 |     10 | 8.78117 |     10 |     10 |
+----+--------+--------+---------+--------+--------+
+----+--------+--------+----------+
|    |   col1 |   col2 |     col3 |
|----+--------+--------+----------|
|  0 |      6 |      2 |  8.38817 |
|  1 |     15 |      3 |  9.19195 |
|  2 |     25 |      9 | 14.8522  |
|  3 |     31 |      8 | 10.0445  |
|  4 |     32 |     11 | 18.7812  |
|  0 |      0 |      0 |  0       |
|  0 |      0 |      0 |  0       |
+----+--------+--------+----------+


In [16]:
data = """
col1,col2,col3,col4,col5
6,1,5.3881673400335695,3,2
9,2,4.191945144032948,5,8
10,8,6.852195003967595,8,1
6,7,2.0445224973151745,8,7
1,10,8.781174363909454,10,10
"""
df = pd.read_csv(StringIO(data))
new_data_testing = """
col1,col2,col3
6,2,8.388167340033569
15,3,9.191945144032948
25,9,14.852195003967594
31,8,10.044522497315175
32,11,18.781174363909454
0,0,0.0
0,0,0.0
"""
def evaluate_correctness(target_df, input_df):
    # drop index column from both
    target_df.reset_index(drop=True, inplace=True)
    input_df.reset_index(drop=True, inplace=True)
    score = 0
    for col in target_df.columns:
        if col not in input_df.columns:
            score -=1 
        else:
            if np.all(target_df[col] != input_df[col]):
                score-=1
    for col in input_df.columns:
        if col not in target_df.columns:
            score -=1
    return score
new_data_df_testing = pd.read_csv(StringIO(new_data_testing))
assert evaluate_correctness(new_data_df_testing, transform_df(df)) == 0

  df_transformed_requested = df_transformed_requested.append(pd.DataFrame([[0,0,0]], columns=df_transformed_requested.columns))
  df_transformed_requested = df_transformed_requested.append(pd.DataFrame([[0,0,0]], columns=df_transformed_requested.columns))


# Editing Augment Fix


## Editing

## calculator

In [20]:
class Calculator:
    def __init__(self):
        # the calculator only keeps track of the current number
        self.current_number = 0
        # stores the previous operations performed
        self.previous_operations = []
    def add(self, a):
        '''
        a: real number
        '''
        # two lines below should not be changed
        self.previous_operations.append((a, "add"))
        self.current_number += a + 20
    
    def subtract(self, a):
        '''
        a: real number
        '''

        # two lines below should not be changed
        self.previous_operations.append((a, "subtract"))
        self.current_number =  self.current_number - a/10

    def multiply(self, a):
        '''
        a: real number
        '''

        # two lines below should not be changed
        self.previous_operations.append((a, "multiply"))
        self.current_number =  (self.current_number ** a ) / a

    def divide(self, a):
        '''
        a: positive integer
        '''

        # two lines below should not be changed
        self.previous_operations.append((a, "divide"))
        self.current_number =  self.current_number / a * 2

    def undo_last_operation(self):
        '''
        undoes the last operation performed and restors current_number to the value before the last operation
        '''
        last_operation = self.previous_operations.pop()
    
    def undo_last_k_operations(self, k):
        ''' 
        undoes the last k operations performed and restores current_number to the value before the last k operations
        Args:
            k (int): number of operations to undo
        '''
        for i in range(k):
            self.undo_last_operation()
          
'''
This is a special calculator that keeps track of the previous operations performed.
Note that the operations like add are not the standard ones

Your job is to fix some remaining bugs in the calculator class:
- you should not modify the behavior add, subtract, multiply, divide methods in terms of the calculation
- make sure that the add, subtract, multiply, divide methods only execute if the input is valid, if the input is not valid, the method should return without doing anything or raising errors
- fix the implementation of undo_last_operation by relying on the previous_operations list
'''

In [30]:

class Calculator:
    def __init__(self):
        # the calculator only keeps track of the current number
        self.current_number = 0
        # stores the previous operations performed
        self.previous_operations = []


    def add(self, a):
        '''
        a: real number
        '''
        if not isinstance(a, (int, float)):
            return
        # two lines below should not be changed
        self.previous_operations.append((a, "add"))
        self.current_number += a + 20
    
    def subtract(self, a):
        '''
        a: real number
        '''
        if not isinstance(a, (int, float)):
            return
        # two lines below should not be changed
        self.previous_operations.append((a, "subtract"))
        self.current_number =  self.current_number - a/10

    def multiply(self, a):
        '''
        a: real number
        '''
        if not isinstance(a, (int, float)) or a == 0:
            return
        # two lines below should not be changed
        self.previous_operations.append((a, "multiply"))
        self.current_number =  (self.current_number ** a ) / a

    def divide(self, a):
        '''
        a: positive integer
        '''
        if not isinstance(a, (int)) or a <= 0:
            return
        # two lines below should not be changed
        self.previous_operations.append((a, "divide"))
        self.current_number =  self.current_number / a * 2

    def undo_last_operation(self):
        '''
        undoes the last operation performed and restors current_number to the value before the last operation
        '''
        last_operation = self.previous_operations.pop()
        if last_operation[1] == "add":
            self.current_number -= last_operation[0] + 20
        elif last_operation[1] == "subtract":
            self.current_number += last_operation[0]/10
        elif last_operation[1] == "multiply":
            self.current_number =  (self.current_number * last_operation[0] ) ** (1/last_operation[0])
        elif last_operation[1] == "divide":
            self.current_number =  self.current_number * last_operation[0] / 2
        
    
    def undo_last_k_operations(self, k):
        ''' 
        undoes the last k operations performed and restores current_number to the value before the last k operations
        Args:
            k (int): number of operations to undo
        '''
        for i in range(k):
            self.undo_last_operation()
          
'''
This is a special calculator that keeps track of the previous operations performed.
Note that the operations like add are not the standard ones

Your job is to fix some remaining bugs in the calculator class:
- you should not modify the behavior add, subtract, multiply, divide methods in terms of the calculation
- make sure that the add, subtract, multiply, divide methods only execute if the input is valid, if the input is not valid, the method should return without doing anything or raising errors
- fix the implementation of undo_last_operation by relying on the previous_operations list
- do not rename any of the existing methods or variable names
'''

'\nThis is a special calculator that keeps track of the previous operations performed.\nNote that the operations like add are not the standard ones\n\nYour job is to fix some remaining bugs in the calculator class:\n- you should not modify the behavior add, subtract, multiply, divide methods in terms of the calculation\n- make sure that the add, subtract, multiply, divide methods only execute if the input is valid, if the input is not valid, the method should return without doing anything or raising errors\n- fix the implementation of undo_last_operation by relying on the previous_operations list\n'

In [38]:
calc = Calculator()
calc.add(5)
print(calc.current_number)
assert calc.current_number == 25
calc.add('a')
print(calc.current_number)
assert calc.current_number == 25
calc.subtract(2.2)
print(calc.current_number)
assert calc.current_number == 24.78
calc.multiply(0)
print(calc.current_number)
assert calc.current_number == 24.78
calc.multiply(2)
print(calc.current_number)
assert calc.current_number == 307.0242
calc.divide(-1)
print(calc.current_number)
assert calc.current_number == 307.0242
calc.undo_last_operation()
print(calc.current_number)
assert calc.current_number == 24.78
calc.undo_last_k_operations(2)
print(calc.current_number)
assert calc.current_number == 0.0

25
25
24.78
24.78
307.0242
307.0242
24.78
0.0


## tokenizer

In [1]:
from collections import Counter

class Tokenizer:
    def __init__(self, max_vocab_size=200):
        self.max_vocab_size = max_vocab_size
        self.word_to_id = {}
        self.id_to_word = {}

    def tokenize(self, text):
        # Split text into words by spaces
        return text.lower().split()

    def build_vocabulary(self, corpus):
        # to be implemented
        # Flatten the list of sentences into a list of words
        all_words = [word for sentence in corpus for word in self.tokenize(sentence)]

        # Count the frequency of each word
        word_freq = Counter(all_words)

        # Select the top 'max_vocab_size' words
        most_common_words = word_freq.most_common(self.max_vocab_size)

        # Assign an ID to each word
        self.word_to_id = {word: idx for idx, (word, _) in enumerate(most_common_words)}
        self.id_to_word = {idx: word for word, idx in self.word_to_id.items()}

    def get_word_id(self, word):
        # Retrieve the ID of a word, return None if the word is not in the vocabulary
        return self.word_to_id.get(word)

    def get_word_by_id(self, word_id):
        # Retrieve a word by its ID, return None if the ID is not in the vocabulary
        return self.id_to_word.get(word_id)


In [4]:
'''

Your goal is to implement the  build_vocabulary method in the Tokenizer class provided. 
A tokenizer is an object that converts words to numerical IDs.

Objective of build_vocabulary Method:

The method's primary goal is to create two dictionaries: self.word_to_id and self.id_to_word.
self.word_to_id should map each unique word in your corpus to a unique numerical identifier (ID).
self.id_to_word is the reverse mapping, where each unique ID corresponds to a word.
The method should only consider the most frequent words in the corpus, up to a limit specified by max_vocab_size.
'''


class Tokenizer:
    def __init__(self, max_vocab_size=200):
        self.max_vocab_size = max_vocab_size
        self.word_to_id = {}
        self.id_to_word = {}

    def tokenize(self, text):
        # do not change
        # Split text into words by spaces
        return text.lower().split()

    def build_vocabulary(self, corpus):
        '''
        corpus: a list of strings (string denotes a sentence composed of words seperated by spaces)
        '''
        # WRITE CODE HERE
        return 
    

    def get_word_id(self, word):
        # do not change
        # Retrieve the ID of a word, return None if the word is not in the vocabulary
        return self.word_to_id.get(word)

    def get_word_by_id(self, word_id):
        # do not change
        # Retrieve a word by its ID, return None if the ID is not in the vocabulary
        return self.id_to_word.get(word_id)


In [5]:
def test_tokenize():
    tokenizer = Tokenizer()
    assert tokenizer.tokenize("Hello world") == ["hello", "world"], "Tokenization failed"

def test_build_vocabulary_and_get_word_id():
    tokenizer = Tokenizer(max_vocab_size=2)
    corpus = ["hello world", "hello python", "hello world"]
    tokenizer.build_vocabulary(corpus)
    
    assert tokenizer.get_word_id("hello") is not None, "'hello' should be in the vocabulary"
    assert tokenizer.get_word_id("world") is not None, "'world' should be in the vocabulary"
    assert tokenizer.get_word_id("python") is None, "'python' should not be in the vocabulary due to max_vocab_size limit"

def test_get_word_by_id():
    tokenizer = Tokenizer(max_vocab_size=2)
    corpus = ["apple orange", "banana apple", "cherry banana"]
    tokenizer.build_vocabulary(corpus)
    
    apple_id = tokenizer.get_word_id("apple")
    assert tokenizer.get_word_by_id(apple_id) == "apple", "ID lookup for 'apple' failed"

    # Assuming 'cherry' is not in the top 2 words and therefore has no ID
    cherry_id = tokenizer.get_word_id("cherry")
    assert cherry_id is None, "'cherry' should not have an ID"
    assert tokenizer.get_word_by_id(cherry_id) is None, "ID lookup for a non-existent word should return None"

# Run the tests
test_tokenize()
test_build_vocabulary_and_get_word_id()
test_get_word_by_id()



AssertionError: 'hello' should be in the vocabulary

### Login authenticator

In [6]:
import hashlib

class LoginAuthenticator:
    def __init__(self):
        self.user_credentials = {}  # dictionary for username: hashed_password

    def _hash_password(self, password):
        """Helper method to hash a password."""
        return hashlib.sha256(password.encode()).hexdigest()

    def add_user(self, username, password):
        """Adds a new user if the username doesn't already exist."""
        if username in self.user_credentials:
            return False  # Username already exists
        self.user_credentials[username] = self._hash_password(password)
        return True

    def authenticate_user(self, username, password):
        """Checks if the given username and password are valid."""
        if username not in self.user_credentials:
            return False
        return self.user_credentials[username] == self._hash_password(password)

    def remove_user(self, username):
        """Removes a user from the system."""
        if username in self.user_credentials:
            del self.user_credentials[username]
            return True
        return False

    def change_password(self, username, old_password, new_password):
        """Changes the password for a user if the old password is correct."""
        if self.authenticate_user(username, old_password):
            self.user_credentials[username] = self._hash_password(new_password)
            return True
        return False


Your goal is to implement the LoginAuthenticator class. This class will be used to authenticate users of a system. 

To implement the methods of the LoginAuthenticator class, follow these instructions for each method:

_hash_password (Private Method):

Purpose: To create a hash of a given password.
Parameters: password (string).
Process: use any hashing tehnique you like
Return: The hashed password 

add_user Method:
Purpose: To add a new user to the system with a username and a password.
Parameters: username (string), password (string).
Process:
Check if the username already exists in self.user_credentials.
If it does, return False to indicate the username is already taken.
If not, hash the password using _hash_password method and store the username and hashed password in self.user_credentials.
Return: True if the user was successfully added, otherwise False.

remove_user Method:

Purpose: To remove a user from the system.
Parameters: username (string).
Process:
Check if the username exists in self.user_credentials.
If it does, delete the username entry from self.user_credentials.
Return: True if the user was successfully removed, otherwise False.


change_password Method:

Purpose: To change a user's password.
Parameters: username (string), old_password (string), new_password (string).
Process:
First, authenticate the user using the authenticate_user method with username and old_password.
If authentication is successful, hash the new_password and update the self.user_credentials with the new hashed password.
Return: True if the password was successfully changed, otherwise False.

In [8]:

class LoginAuthenticator:
    def __init__(self):
        # DO NOT CHANGE
        self.user_credentials = {}  # dictionary for username: hashed_password

    def _hash_password(self, password):
        # WRITE CODE HERE
        return

    def add_user(self, username, password):
        # WRITE CODE HERE
        return

    def authenticate_user(self, username, password):
        # DO NOT CHANGE
        """Checks if the given username and password are valid."""
        if username not in self.user_credentials:
            return False
        return self.user_credentials[username] == self._hash_password(password)

    def remove_user(self, username):
        # WRITE CODE HERE
        return

    def change_password(self, username, old_password, new_password):
        # WRITE CODE HERE
        return


In [9]:
# Assuming the LoginAuthenticator class is defined as previously provided

# Create an instance of the LoginAuthenticator
authenticator = LoginAuthenticator()

# Test adding new users
assert authenticator.add_user("user1", "password1") == True  # Should succeed
assert authenticator.add_user("user2", "password2") == True  # Should succeed
assert authenticator.add_user("user1", "new_password") == False  # Should fail, user1 already exists

# Test authenticating users
assert authenticator.authenticate_user("user1", "password1") == True  # Correct credentials
assert authenticator.authenticate_user("user1", "wrong_password") == False  # Wrong password
assert authenticator.authenticate_user("user3", "password") == False  # Non-existent user

# Test removing users
assert authenticator.remove_user("user1") == True  # Should succeed in removing user1
assert authenticator.remove_user("user1") == False  # user1 no longer exists
assert authenticator.remove_user("user3") == False  # user3 does not exist

# Test changing passwords
assert authenticator.change_password("user2", "password2", "newpass2") == True  # Should succeed
assert authenticator.authenticate_user("user2", "newpass2") == True  # New password should work
assert authenticator.change_password("user2", "password2", "anothernewpass") == False  # Old password no longer valid
assert authenticator.change_password("nonexistent_user", "pass", "newpass") == False  # Non-existent user

print("All tests passed!")


AssertionError: 

### T test


Your goal is to complete the function simplified_t_test. This function takes as input two arrays of numbers and will return a float value called t_test. 

The simplified_t_test is a statistical test that is used to compare the means of two populations. The value is computed as follows:

t_test =  abs ( (mean1 - mean2) / sqrt((variance1 / n1) + (variance2 / n2))  )

where mean1 and mean2 are the means of the two populations, variance1 and variance2 are the variances of the two populations with a modified denominator:
variance1 = sum((x - mean1)^2) / (n1 - 2)
variance2 = sum((x - mean2)^2) / (n2 - 2)

, and n1 and n2 are the number of samples in each population. Note this is not the ordinary t-test, but a simplified version of it.



In [None]:

# function signature
def simplified_t_test(sample1, sample2):
    """
    :param sample1: List or array of sample data (sample 1)
    :param sample2: List or array of sample data (sample 2)
    :return: simplified t-test statistic
    """
    t_test = 0
    # write your code here
    return t_test


In [33]:

# function signature
import numpy as np

def simplified_t_test(sample1, sample2):
    """
    :param sample1: List or array of sample data (sample 1)
    :param sample2: List or array of sample data (sample 2)
    :return: simplified t-test statistic
    """
    t_test = 0
    # write your code here
    mean1 = np.mean(sample1)
    mean2 = np.mean(sample2)
    # variance with modified denominator
    variance1 = np.var(sample1, ddof=2)
    variance2 = np.var(sample2, ddof=2)
    n1 = len(sample1)
    n2 = len(sample2)
    t_test = (mean1 - mean2) / np.sqrt(variance1/n1 + variance2/n2)
    return abs(t_test)


In [38]:
import numpy as np

# Test with known values
sample1 = [10, 20, 30, 40, 50]
sample2 = [30, 40, 50, 60, 70]
expected_t_stat = 1.7320508075688774  # This value should be pre-calculated
print(simplified_t_test(sample1, sample2))
assert np.isclose(simplified_t_test(sample1, sample2), expected_t_stat, atol=1e-3), "Test with known values failed"

# Test with identical samples
identical_sample = [1, 2, 3, 4, 5]
assert simplified_t_test(identical_sample, identical_sample) == 0, "Test with identical samples failed"


sample1 = [1,2,-1,3,4]
sample2 = [2,3,-2,4,5]
expected_t_stat = 0.35032452487268523
print(simplified_t_test(sample1, sample2))
assert np.isclose(simplified_t_test(sample1, sample2), expected_t_stat, atol=1e-3), "Test with known values failed"



Ttest_indResult(statistic=-2.0, pvalue=0.08051623795726257)
1.7320508075688774
Ttest_indResult(statistic=-2.0, pvalue=0.08051623795726257)
Ttest_indResult(statistic=0.0, pvalue=1.0)
Ttest_indResult(statistic=-0.40451991747794513, pvalue=0.6975372096030519)
0.35032452487268523
Ttest_indResult(statistic=-0.40451991747794513, pvalue=0.6975372096030519)


### Retreiver

Your task is to create a class called Retriever. This class will be used to retrieve similar vectors from a collection of vectors. You should follow the instructions below to complete this task.


Create an instance of the Retriever class by providing two arguments:
vectors: A numpy array of vectors you want to analyze.
k: An integer indicating the number of top similar vectors you want to retrieve.
Example:

from numpy import array
vectors = array([[1, 2], [3, 4], [5, 6]])
k = 2
retriever = Retriever(vectors, k)


Setting 'k' Value:

Use the set_k method to update the value of k (number of top vectors to retrieve).
This method takes a single integer argument.
The value of k should be between 1 and the total number of vectors. If not, then the method should do nothing (do not raise an error).
Example:
retriever.set_k(3)

Adding New Vectors:

Add additional vectors to your existing collection using the add_vectors method.
This method accepts a numpy array of new vectors to be added.
Example:

new_vectors = array([[7, 8], [9, 10]])
retriever.add_vectors(new_vectors)


Calculating Distances:

To calculate the distance between a query vector and all stored vectors, use the distance method.
This method takes a single numpy array representing the query vector.
It returns a numpy array of distances.
Example:


query_vector = array([1, 2])
distances = retriever.distance(query_vector)


Retrieving Top 'k' Similar Vectors:

Use the get_top_k_similar_vectors method to find the top 'k' vectors most similar to a given query vector.
This method takes a single numpy array as the query vector.
It returns a numpy array of the top 'k' similar vectors.

Example:

top_vectors = retriever.get_top_k_similar_vectors(query_vector)

Generating a Similarity Matrix:

To create a similarity matrix between multiple queries and the stored vectors, use the get_similarity_matrix method.
This method accepts a numpy array of query vectors.
It returns a 2D numpy array where each row corresponds to the distances between a query vector and all stored vectors.

Example:

query_vectors = array([[1, 2], [3, 4]])
similarity_matrix = retriever.get_similarity_matrix(query_vectors)


In [6]:

import numpy as np
class Retriever:
    def __init__(self, vectors, k):
        self.vectors = vectors
        self.k = k

    def set_k(self, k):
        if k > len(self.vectors) or k < 1:
            return
        self.k = k

    def add_vectors(self, new_vectors):
        self.vectors = np.concatenate((self.vectors, new_vectors))
        
    def distance(self, query):
        ''' 
        query: single numpy arrray
        return: inverse l2 distances from query to the vectors
        '''
        distances = np.linalg.norm(self.vectors - query, axis=1)
        return distances
    
    def get_top_k_similar_vectors(self, query):
        '''
        query: single numpy array
        return: top k similar vectors
        '''
        scores = self.distance(query)
        # np.argsort sorts in ascending order
        indices_top = np.argsort(scores)
        top_k_indices = indices_top[:self.k]
        return self.vectors[top_k_indices]
    
    def get_similarity_matrix(self, queries):
        '''
        queries: numpy array of query vectors
        return: similarity matrix of size (len(queries), len(self.vectors))
        '''
        similarity_matrix = []
        for query in queries:
            similarity_matrix.append(self.distance(query))
        return np.array(similarity_matrix)

In [15]:
import numpy as np

# Test Initialization
vectors = np.array([[1, 2], [3, 4], [5, 6]])
k = 2
retriever = Retriever(vectors, k)
assert (retriever.vectors == vectors).all() and retriever.k == k, "Initialization Failed"

# Test set_k Method
retriever.set_k(1)
assert retriever.k == 1, "set_k Method Failed"
retriever.set_k(0)  # Edge case
assert retriever.k == 1, "set_k Method Failed on Edge Case"

# Test add_vectors Method
new_vectors = np.array([[7, 8], [9, 10]])
retriever.add_vectors(new_vectors)
assert (retriever.vectors == np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]])).all(), "add_vectors Method Failed"

# Test distance Method
query = np.array([1, 2])
distances = retriever.distance(query)
ground_truth_distances = np.array([0, 2.82842712, 5.65685425, 8.48528137, 11.3137085])
assert np.allclose(distances, ground_truth_distances, atol=1e-3), "distance Method Failed"
assert len(distances) == len(retriever.vectors), "distance Method Failed"

# Test get_top_k_similar_vectors Method
top_vectors = retriever.get_top_k_similar_vectors(query)
ground_truth_top_vectors = np.array([[1, 2]])
assert (top_vectors == ground_truth_top_vectors).all(), "get_top_k_similar_vectors Method Failed"
assert len(top_vectors) == retriever.k, "get_top_k_similar_vectors Method Failed"

# Test get_similarity_matrix Method
query_vectors = np.array([[1, 2], [3, 4]])
similarity_matrix = retriever.get_similarity_matrix(query_vectors)

assert similarity_matrix.shape == (len(query_vectors), len(retriever.vectors)), "get_similarity_matrix Method Failed"


[ 0.          2.82842712  5.65685425  8.48528137 11.3137085 ]


### event scheduler


Problem Description:

Input:

You have a list of events.
Each event is represented as a tuple (start, end, score).
start: The starting hour of the event (an integer between 0 and 10).
end: The ending hour of the event (an integer between start and 10).
score: The importance score of the event (a positive integer).
Constraints:

The events can only be scheduled between the hours of 0:00 and 10:00.
No two events can overlap. An event with an end time of X cannot overlap with another event with a start time of X.
Each event can be scheduled only once.
Objective:

Your goal is to schedule the events in such a way that the total importance score is maximized.
The algorithm should return the maximum total importance score that can be achieved with the given set of events.

Example:

Suppose you have the following list of events:

Event 1: (1, 3, 5)
Event 2: (1, 2, 3)
Event 3: (2, 3, 4)

Best schedule would be to pick Event 2 and Event 3, which would give a total importance score of 7.

The algorithm should determine the best way to schedule these events between 0:00 and 10:00 to achieve the highest total importance score, without any overlapping of events.

Output: The algorithm should return a single integer, which is the highest total importance score achievable under the given constraints.

In [20]:
test_events = [(1, 2, 10), (2,3,5), (1,3,14)]

def schedule_events(events):
    '''
    events is a list of tuples of the form (start_time, end_time, score)
    '''
    score = 0
    # write your code here

    return score

print(schedule_events(test_events))

0


In [24]:
#event scheduler

def binary_search(events, index):
    lo, hi = 0, index - 1
    while lo <= hi:
        mid = (lo + hi) // 2
        if events[mid][1] <= events[index][0]:
            if events[mid + 1][1] <= events[index][0]:
                lo = mid + 1
            else:
                return mid
        else:
            hi = mid - 1
    return -1

def schedule_events(events):
    # Sort the events based on their end time
    events.sort(key=lambda x: x[1])

    n = len(events)
    dp = [0] * n
    dp[0] = events[0][2]

    for i in range(1, n):
        incl_prof = events[i][2]
        l = binary_search(events, i)
        if l != -1:
            incl_prof += dp[l]

        dp[i] = max(incl_prof, dp[i - 1])

    return dp[n-1]



In [26]:

# Test Case 1: Single event
events = [(0, 2, 10)]
assert schedule_events(events) == 10, "Test Case 1 Failed"

# Test Case 2: Two non-overlapping events
events = [(0, 2, 10), (2, 4, 15)]
assert schedule_events(events) == 25, "Test Case 2 Failed"

# Test Case 3: Two overlapping events, one with higher score
events = [(0, 3, 10), (2, 5, 20)]
assert schedule_events(events) == 20, "Test Case 3 Failed"

# Test Case 4: Multiple events, some overlapping
events = [(0, 3, 10), (2, 5, 15), (5, 7, 20)]
assert schedule_events(events) == 35, "Test Case 4 Failed"

# Test Case 5: Events with the same time
events = [(1, 4, 10), (1, 4, 15)]
assert schedule_events(events) == 15, "Test Case 5 Failed"

# Test Case 6: Events spread throughout the day
events = [(0, 2, 10), (3, 5, 15), (6, 8, 20), (9, 10, 25)]
assert schedule_events(events) == 70, "Test Case 6 Failed"

# Test Case 7: Non-overlapping events with equal score
events = [(0, 2, 10), (2, 4, 10), (4, 6, 10)]
assert schedule_events(events) == 30, "Test Case 7 Failed"

# Test Case 8: Overlapping events with varying scores
events = [(0, 4, 20), (3, 5, 30), (5, 7, 25)]
assert schedule_events(events) == 55, "Test Case 8 Failed"

# Test Case 9: All events overlapping
events = [(1, 3, 10), (2, 4, 15), (2, 5, 20)]
assert schedule_events(events) == 20, "Test Case 9 Failed"


print("All test cases passed!")


All test cases passed!


### Graph class

# Create jsons

In [7]:
import json

# Define your details
name = "YourName"
task_description = """
"""
function_signature = """

"""

unit_test = """

"""
solution = """

"""
data_type = "data_manip"

data = {
    'name': name,
    'task_description': task_description,
    'function_signature': function_signature,
    'unit_test': unit_test,
    'solution': solution,
    'type': data_type
}

# Save the dictionary to a JSON file
with open("tasks/" + name+'.json', 'w') as json_file:
    json.dump(data, json_file, indent=4)

In [12]:
import json

# Define your details
name = "retriever"
task_description = """
Your task is to create a class called Retriever. This class will be used to retrieve similar vectors from a collection of vectors. You should follow the instructions below to complete this task.


Create an instance of the Retriever class by providing two arguments:
vectors: A numpy array of vectors you want to analyze.
k: An integer indicating the number of top similar vectors you want to retrieve.
Example:

from numpy import array
vectors = array([[1, 2], [3, 4], [5, 6]])
k = 2
retriever = Retriever(vectors, k)


Setting 'k' Value:

Use the set_k method to update the value of k (number of top vectors to retrieve).
This method takes a single integer argument.
The value of k should be between 1 and the total number of vectors. If not, then the method should do nothing (do not raise an error).
Example:
retriever.set_k(3)

Adding New Vectors:

Add additional vectors to your existing collection using the add_vectors method.
This method accepts a numpy array of new vectors to be added.
Example:

new_vectors = array([[7, 8], [9, 10]])
retriever.add_vectors(new_vectors)


Calculating Distances:

To calculate the distance between a query vector and all stored vectors, use the distance method.
This method takes a single numpy array representing the query vector.
It returns a numpy array of distances.
Example:


query_vector = array([1, 2])
distances = retriever.distance(query_vector)


Retrieving Top 'k' Similar Vectors:

Use the get_top_k_similar_vectors method to find the top 'k' vectors most similar to a given query vector.
This method takes a single numpy array as the query vector.
It returns a numpy array of the top 'k' similar vectors.

Example:

top_vectors = retriever.get_top_k_similar_vectors(query_vector)

Generating a Similarity Matrix:

To create a similarity matrix between multiple queries and the stored vectors, use the get_similarity_matrix method.
This method accepts a numpy array of query vectors.
It returns a 2D numpy array where each row corresponds to the distances between a query vector and all stored vectors.

Example:

query_vectors = array([[1, 2], [3, 4]])
similarity_matrix = retriever.get_similarity_matrix(query_vectors)
"""
function_signature = """
class Retriever:
"""

unit_test = """
import numpy as np

# Test Initialization
vectors = np.array([[1, 2], [3, 4], [5, 6]])
k = 2
retriever = Retriever(vectors, k)
assert (retriever.vectors == vectors).all() and retriever.k == k, "Initialization Failed"

# Test set_k Method
retriever.set_k(1)
assert retriever.k == 1, "set_k Method Failed"
retriever.set_k(0)  # Edge case
assert retriever.k == 1, "set_k Method Failed on Edge Case"

# Test add_vectors Method
new_vectors = np.array([[7, 8], [9, 10]])
retriever.add_vectors(new_vectors)
assert (retriever.vectors == np.array([[1, 2], [3, 4], [5, 6], [7, 8], [9, 10]])).all(), "add_vectors Method Failed"

# Test distance Method
query = np.array([1, 2])
distances = retriever.distance(query)
ground_truth_distances = np.array([0, 2.82842712, 5.65685425, 8.48528137, 11.3137085])
assert np.allclose(distances, ground_truth_distances, atol=1e-3), "distance Method Failed"
assert len(distances) == len(retriever.vectors), "distance Method Failed"

# Test get_top_k_similar_vectors Method
top_vectors = retriever.get_top_k_similar_vectors(query)
ground_truth_top_vectors = np.array([[1, 2]])
assert (top_vectors == ground_truth_top_vectors).all(), "get_top_k_similar_vectors Method Failed"
assert len(top_vectors) == retriever.k, "get_top_k_similar_vectors Method Failed"

# Test get_similarity_matrix Method
query_vectors = np.array([[1, 2], [3, 4]])
similarity_matrix = retriever.get_similarity_matrix(query_vectors)

assert similarity_matrix.shape == (len(query_vectors), len(retriever.vectors)), "get_similarity_matrix Method Failed"
"""
solution = """
import numpy as np
class Retriever:
    def __init__(self, vectors, k):
        self.vectors = vectors
        self.k = k

    def set_k(self, k):
        if k > len(self.vectors) or k < 1:
            return
        self.k = k

    def add_vectors(self, new_vectors):
        self.vectors = np.concatenate((self.vectors, new_vectors))
        
    def distance(self, query):
        ''' 
        query: single numpy arrray
        return: inverse l2 distances from query to the vectors
        '''
        distances = np.linalg.norm(self.vectors - query, axis=1)
        return distances
    
    def get_top_k_similar_vectors(self, query):
        '''
        query: single numpy array
        return: top k similar vectors
        '''
        scores = self.distance(query)
        # np.argsort sorts in ascending order
        indices_top = np.argsort(scores)
        top_k_indices = indices_top[:self.k]
        return self.vectors[top_k_indices]
    
    def get_similarity_matrix(self, queries):
        '''
        queries: numpy array of query vectors
        return: similarity matrix of size (len(queries), len(self.vectors))
        '''
        similarity_matrix = []
        for query in queries:
            similarity_matrix.append(self.distance(query))
        return np.array(similarity_matrix)
"""
data_type = "lengthy_code"

data = {
    'name': name,
    'task_description': task_description,
    'function_signature': function_signature,
    'unit_test': unit_test,
    'solution': solution,
    'type': data_type
}

# Save the dictionary to a JSON file
with open("tasks/" + name+'.json', 'w') as json_file:
    json.dump(data, json_file, indent=4)