# Data Science Task

## Valerie task

In [1]:
import pandas as pd
import numpy as np
import random

In [26]:

def col_discretize(nrows):
    
    ages = np.random.randint(1,40, nrows)
    
    def age_filter(num):
        if num < 18:
            return "Under 18"
        elif num >= 18 and num < 26:
            return "18-25"
        elif num >= 26 and num < 35:
            return "26-34"
        else:
            return "35 and Above"
    
    new_ages = [age_filter(num) for num in ages]
    
    return pd.DataFrame({'age':ages}), pd.DataFrame({'age':new_ages})

def col_onehot(nrows):
    
    from random import choices 
  
    colors = ['brown', 'green', 'blue']
    lst = choices(colors,k=nrows)
    one_hot = pd.get_dummies(lst)
    
    return pd.DataFrame({'color':lst}), one_hot
    

def process_date(nrows):
    
    '''
    2019-03-22 00:00:00 --> 2019, 03, 22
    '''
    
    # if we want more rows, that would be a problem.. 
    
    from datetime import date, timedelta
    sdate = date(2019,3,1)   # start date
    edate = date(2019,3,1+nrows)   # end date
    dates = pd.date_range(sdate,edate-timedelta(days=1),freq='d').tolist()
    random.shuffle(dates)
    

    d_old = {'dates': dates}
    df = pd.DataFrame(data=d_old)
    df_temp = pd.to_datetime(df['dates'],format='%Y-%m-%d')
    
    d_new = {'year': df_temp.dt.year, 'month': df_temp.dt.month, 'day':df_temp.dt.day}
    df_new = pd.DataFrame(data=d_new)
         
    return df.reset_index(drop=True) , df_new.reset_index(drop=True) 
    #return df.sa(n=nrows, random_state=1), df_new.choices(n=nrows, random_state=1)


def round_number(nrows):
    
    '''
    round generated list to a randomly generated number of digits
    '''
    
    round_digits = np.random.randint(4)
    
    orig = np.random.rand(nrows)*10
    mod = orig.round(round_digits)
    
    return pd.DataFrame({'height':orig}), pd.DataFrame({'height':mod})

def col_drop(df):
    
    '''
    given df, randomly drop between 1-4 columns
    '''
    
    from random import sample
    num_cols_to_drop = np.random.randint(1,4)
    cols_to_drop = sample(df.columns.tolist(), num_cols_to_drop)

    return df.drop(columns=cols_to_drop)


def create_datasets(rand_seed):
    
    random.seed(rand_seed)
    np.random.seed(rand_seed) 
    
    nrows = 10
    
    fns_to_apply = ["col_discretize", "col_onehot", "process_date", "round_number"]
    
    orig_df = pd.DataFrame()
    new_df = pd.DataFrame()
    
    
    for fn in fns_to_apply:
        orig, new = globals()[fn](nrows)
        
        orig_df = pd.concat([orig_df, orig], axis=1)
        new_df = pd.concat([new_df, new], axis=1)
                
    
    new_df = col_drop(new_df)
    
    return orig_df, new_df

def evaluate_correctness(target_df, input_df):
    score = 0
    for col in target_df.columns:
        if col not in input_df.columns:
            score -=1 
        else:
            if np.all(target_df[col] != input_df[col]):
                score-=1
    return score

In [27]:
seed = 0
orig_df, new_df = create_datasets(seed)
evaluate_correctness(orig_df, new_df)
from tabulate import tabulate

#get the text form of each dataframe
txt = tabulate(orig_df.head(), headers = 'keys', tablefmt = 'psql')
txt_new = tabulate(new_df.head(), headers = 'keys', tablefmt = 'psql')
print(txt)
print(txt_new)


+----+-------+---------+---------------------+----------+
|    |   age | color   | dates               |   height |
|----+-------+---------+---------------------+----------|
|  0 |     1 | blue    | 2019-03-06 00:00:00 |  2.72656 |
|  1 |     4 | blue    | 2019-03-05 00:00:00 |  4.77665 |
|  2 |     4 | green   | 2019-03-10 00:00:00 |  8.12169 |
|  3 |    10 | brown   | 2019-03-07 00:00:00 |  4.79977 |
|  4 |    20 | green   | 2019-03-01 00:00:00 |  3.92785 |
+----+-------+---------+---------------------+----------+
+----+----------+--------+---------+---------+---------+-------+----------+
|    | age      |   blue |   brown |   green |   month |   day |   height |
|----+----------+--------+---------+---------+---------+-------+----------|
|  0 | Under 18 |      1 |       0 |       0 |       3 |     6 |        3 |
|  1 | Under 18 |      1 |       0 |       0 |       3 |     5 |        5 |
|  2 | Under 18 |      0 |       0 |       1 |       3 |    10 |        8 |
|  3 | Under 18 |     

In [28]:
import pandas as pd
from io import StringIO

# Original dataset
data = """
age,color,dates,height
1,blue,2019-03-06,2.72656
4,blue,2019-03-05,4.77665
4,green,2019-03-10,8.12169
10,brown,2019-03-07,4.79977
20,green,2019-03-01,3.92785
"""

# Read the dataset into a DataFrame
orig_df = pd.read_csv(StringIO(data))

df = orig_df.copy()
# Process the dataset to match the desired format
# 1. Convert 'age' to categorical data
df['age'] = pd.cut(df['age'], bins=[0, 18, 25, 100], labels=["Under 18", "18-25", "Over 25"], right=False)

# 2. Convert 'color' to one-hot encoding
color_dummies = pd.get_dummies(df['color'])
df = pd.concat([df, color_dummies], axis=1)

# 3. Extract 'month' and 'day' from 'dates'
df['dates'] = pd.to_datetime(df['dates'])
df['month'] = df['dates'].dt.month
df['day'] = df['dates'].dt.day

# 4. Round 'height' to nearest integer
df['height'] = df['height'].round().astype(int)

# 5. Drop the original 'color' and 'dates' columns
df.drop(['color', 'dates'], axis=1, inplace=True)

# Rearrange columns to match the desired format
df = df[['age', 'blue', 'brown', 'green', 'month', 'day', 'height']]

df
# convert to string
df_str = df.to_csv(index=False)
print(df_str)



age,blue,brown,green,month,day,height
Under 18,1,0,0,3,6,3
Under 18,1,0,0,3,5,5
Under 18,0,0,1,3,10,8
Under 18,0,1,0,3,7,5
18-25,0,0,1,3,1,4



In [33]:
import pandas as pd
from io import StringIO

# Original dataset
data = """
age,color,dates,height
1,blue,2019-03-06,2.72656
4,blue,2019-03-05,4.77665
4,green,2019-03-10,8.12169
10,brown,2019-03-07,4.79977
20,green,2019-03-01,3.92785
"""

# Read the dataset into a DataFrame
df = pd.read_csv(StringIO(data))
def transform_df(df):
    # Process the dataset to match the desired format
    # 1. Convert 'age' to categorical data
    df['age'] = pd.cut(df['age'], bins=[0, 18, 25, 100], labels=["Under 18", "18-25", "Over 25"], right=False)

    # 2. Convert 'color' to one-hot encoding
    color_dummies = pd.get_dummies(df['color'])
    df = pd.concat([df, color_dummies], axis=1)

    # 3. Extract 'month' and 'day' from 'dates'
    df['dates'] = pd.to_datetime(df['dates'])
    df['month'] = df['dates'].dt.month
    df['day'] = df['dates'].dt.day

    # 4. Round 'height' to nearest integer
    df['height'] = df['height'].round().astype(int)

    # 5. Drop the original 'color' and 'dates' columns
    df.drop(['color', 'dates'], axis=1, inplace=True)

    # Rearrange columns to match the desired format
    df = df[['age', 'blue', 'brown', 'green', 'month', 'day', 'height']]
    return df


In [35]:
data = """
age,color,dates,height
1,blue,2019-03-06,2.72656
4,blue,2019-03-05,4.77665
4,green,2019-03-10,8.12169
10,brown,2019-03-07,4.79977
20,green,2019-03-01,3.92785
"""
df = pd.read_csv(StringIO(data))
new_data_testing = """
age,blue,brown,green,month,day,height
Under 18,1,0,0,3,6,3
Under 18,1,0,0,3,5,5
Under 18,0,0,1,3,10,8
Under 18,0,1,0,3,7,5
18-25,0,0,1,3,1,4
"""
def evaluate_correctness(target_df, input_df):
    score = 0
    for col in target_df.columns:
        if col not in input_df.columns:
            score -=1 
        else:
            if np.all(target_df[col] != input_df[col]):
                score-=1
    return score
new_data_df_testing = pd.read_csv(StringIO(new_data_testing))
assert evaluate_correctness(new_data_df_testing, transform_df(df)) == 0

AssertionError: 

## Another Problem

In [90]:
import numpy as np
import pandas as pd
from tabulate import tabulate

# Define the number of rows
n_rows = 6

# Create the DataFrame with random data
np.random.seed(0)  # Set seed for reproducibility
df_random = pd.DataFrame({
    'col1': np.random.randint(1, 11, size=n_rows),
    'col2': np.random.randint(1, 11, size=n_rows),
    'col3': np.random.uniform(0, 10, size=n_rows),
    'col4': np.random.randint(1, 11, size=n_rows),
    'col5': np.random.randint(1, 11, size=n_rows)
})
# remove index column
df_random.reset_index(drop=True, inplace=True)

df_str = df_random.to_csv(index=False)
print(df_str)

col1,col2,col3,col4,col5
6,4,0.5671297731744318,10,4
1,6,2.726562945801132,9,6
4,3,4.776651173213499,10,1
4,5,8.121687287754932,5,3
8,8,4.799771723750573,4,4
10,7,3.9278479610082973,1,9



In [62]:
# target 
# Transform the DataFrame as requested

df_transformed = df_random.copy()

# col1 as the multiplication of col1 and col4
df_transformed['col1'] = df_transformed['col1'] * df_transformed['col4']

# col2 as col3 truncated to the nearest integer
df_transformed['col2'] = df_transformed['col3'].round().astype(int)

# col4 multiplied by 100
df_transformed['col4'] = df_transformed['col4'] * 100

# Remove col5
df_transformed.drop('col5', axis=1, inplace=True)

df_transformed

df_str = df_transformed.to_csv(index=False)
print(df_str)

col1,col2,col3,col4
60,1,0.5671297731744318,1000
9,3,2.726562945801132,900
40,5,4.776651173213499,1000
20,8,8.121687287754932,500
32,5,4.799771723750573,400
10,4,3.9278479610082973,100



In [91]:
from tabulate import tabulate

#get the text form of each dataframe
txt = tabulate(df_random, headers = 'keys', tablefmt = 'psql')
txt_new = tabulate(df_transformed, headers = 'keys', tablefmt = 'psql')
print(txt)
print(txt_new)

+----+--------+--------+---------+--------+--------+
|    |   col1 |   col2 |    col3 |   col4 |   col5 |
|----+--------+--------+---------+--------+--------|
|  0 |      6 |      4 | 0.56713 |     10 |      4 |
|  1 |      1 |      6 | 2.72656 |      9 |      6 |
|  2 |      4 |      3 | 4.77665 |     10 |      1 |
|  3 |      4 |      5 | 8.12169 |      5 |      3 |
|  4 |      8 |      8 | 4.79977 |      4 |      4 |
|  5 |     10 |      7 | 3.92785 |      1 |      9 |
+----+--------+--------+---------+--------+--------+
+----+--------+--------+---------+--------+
|    |   col1 |   col2 |    col3 |   col4 |
|----+--------+--------+---------+--------|
|  0 |     60 |      1 | 0.56713 |   1000 |
|  1 |      9 |      3 | 2.72656 |    900 |
|  2 |     40 |      5 | 4.77665 |   1000 |
|  3 |     20 |      8 | 8.12169 |    500 |
|  4 |     32 |      5 | 4.79977 |    400 |
|  5 |     10 |      4 | 3.92785 |    100 |
+----+--------+--------+---------+--------+


In [80]:
import pandas as pd
from io import StringIO

# Original dataset
data = """
col1,col2,col3,col4,col5
6,4,0.5671297731744318,10,4
1,6,2.726562945801132,9,6
4,3,4.776651173213499,10,1
4,5,8.121687287754932,5,3
8,8,4.799771723750573,4,4
10,7,3.9278479610082973,1,9
"""

# Read the dataset into a DataFrame
df = pd.read_csv(StringIO(data))
def transform_df(df):
    df_transformed = df.copy()

    # col1 as the multiplication of col1 and col4
    df_transformed['col1'] = df['col1'] * df['col4']

    # col2 as col3 truncated to the nearest integer
    df_transformed['col2'] = df['col3'].round().astype(int)

    # col4 multiplied by 100
    df_transformed['col4'] = df['col4'] * 100

    # Remove col5
    df_transformed.drop('col5', axis=1, inplace=True)
    return df_transformed

In [81]:
data = """
col1,col2,col3,col4,col5
6,4,0.5671297731744318,10,4
1,6,2.726562945801132,9,6
4,3,4.776651173213499,10,1
4,5,8.121687287754932,5,3
8,8,4.799771723750573,4,4
10,7,3.9278479610082973,1,9
"""
df = pd.read_csv(StringIO(data))
new_data_testing = """
col1,col2,col3,col4
60,1,0.5671297731744318,1000
9,3,2.726562945801132,900
40,5,4.776651173213499,1000
20,8,8.121687287754932,500
32,5,4.799771723750573,400
10,4,3.9278479610082973,100
"""
def evaluate_correctness(target_df, input_df):
    score = 0
    for col in target_df.columns:
        if col not in input_df.columns:
            score -=1 
        else:
            if np.all(target_df[col] != input_df[col]):
                score-=1
    for col in input_df.columns:
        if col not in target_df.columns:
            score -=1
    return score
new_data_df_testing = pd.read_csv(StringIO(new_data_testing))
assert evaluate_correctness(new_data_df_testing, transform_df(df)) == 0

### Another problem

In [93]:
import numpy as np
import pandas as pd
from tabulate import tabulate

# Define the number of rows
n_rows = 6

# Create the DataFrame with random data
np.random.seed(1)  # Set seed for reproducibility
df_random = pd.DataFrame({
    'col1': np.random.randint(1, 11, size=n_rows),
    'col2': np.random.randint(1, 11, size=n_rows),
    'col3': np.random.uniform(0, 10, size=n_rows),
    'col4': np.random.randint(1, 11, size=n_rows),
    'col5': np.random.randint(1, 11, size=n_rows)
})
# remove index column
df_random.index.name = None

df_str = df_random.to_csv(index=False)
print(df_str)

col1,col2,col3,col4,col5
6,2,4.191945144032948,8,7
9,8,6.852195003967595,8,10
10,7,2.0445224973151745,10,10
6,10,8.781174363909454,2,8
1,3,0.27387593197926163,8,7
1,5,6.704675101784022,1,10



In [94]:
def transform_df(df):
    df_transformed_requested = df.copy()
    df_transformed_requested['col1'] = df['col1'].cumsum()

    # col2 minus the average of col2
    average_col2 = df['col2'].mean()
    df_transformed_requested['col2'] = df['col2'] - average_col2

    # col3 minus col4
    df_transformed_requested['col3'] = df['col3'] - df['col4']

    # Remove col4 and col5
    df_transformed_requested.drop(['col4', 'col5'], axis=1, inplace=True)
    return df_transformed_requested
transformed_df = transform_df(df_random)
df_str = transformed_df.to_csv(index=False)
print(df_str)

col1,col2,col3
6,-3.833333333333333,-3.808054855967052
15,2.166666666666667,-1.1478049960324048
25,1.166666666666667,-7.9554775026848255
31,4.166666666666667,6.781174363909454
32,-2.833333333333333,-7.726124068020738
33,-0.833333333333333,5.704675101784022



In [95]:
from tabulate import tabulate

#get the text form of each dataframe
txt = tabulate(df_random, headers = 'keys', tablefmt = 'psql')
txt_new = tabulate(transformed_df, headers = 'keys', tablefmt = 'psql')
print(txt)
print(txt_new)

+----+--------+--------+----------+--------+--------+
|    |   col1 |   col2 |     col3 |   col4 |   col5 |
|----+--------+--------+----------+--------+--------|
|  0 |      6 |      2 | 4.19195  |      8 |      7 |
|  1 |      9 |      8 | 6.8522   |      8 |     10 |
|  2 |     10 |      7 | 2.04452  |     10 |     10 |
|  3 |      6 |     10 | 8.78117  |      2 |      8 |
|  4 |      1 |      3 | 0.273876 |      8 |      7 |
|  5 |      1 |      5 | 6.70468  |      1 |     10 |
+----+--------+--------+----------+--------+--------+
+----+--------+-----------+----------+
|    |   col1 |      col2 |     col3 |
|----+--------+-----------+----------|
|  0 |      6 | -3.83333  | -3.80805 |
|  1 |     15 |  2.16667  | -1.1478  |
|  2 |     25 |  1.16667  | -7.95548 |
|  3 |     31 |  4.16667  |  6.78117 |
|  4 |     32 | -2.83333  | -7.72612 |
|  5 |     33 | -0.833333 |  5.70468 |
+----+--------+-----------+----------+


In [97]:
data = """
col1,col2,col3,col4,col5
6,2,4.191945144032948,8,7
9,8,6.852195003967595,8,10
10,7,2.0445224973151745,10,10
6,10,8.781174363909454,2,8
1,3,0.27387593197926163,8,7
1,5,6.704675101784022,1,10
"""
df = pd.read_csv(StringIO(data))
new_data_testing = """
col1,col2,col3
6,-3.833333333333333,-3.808054855967052
15,2.166666666666667,-1.1478049960324048
25,1.166666666666667,-7.9554775026848255
31,4.166666666666667,6.781174363909454
32,-2.833333333333333,-7.726124068020738
33,-0.833333333333333,5.704675101784022
"""
def evaluate_correctness(target_df, input_df):
    # drop index column from both
    target_df.reset_index(drop=True, inplace=True)
    input_df.reset_index(drop=True, inplace=True)
    score = 0
    for col in target_df.columns:
        if col not in input_df.columns:
            score -=1 
        else:
            if np.all(target_df[col] != input_df[col]):
                score-=1
    for col in input_df.columns:
        if col not in target_df.columns:
            score -=1
    return score
new_data_df_testing = pd.read_csv(StringIO(new_data_testing))
assert evaluate_correctness(new_data_df_testing, transform_df(df)) == 0