# SecureFrame: Machine Learning Using Semi-Encrypted DataFrames

## Import Python Libraries

In [1]:
import random
import ast
import numpy as np
import pandas as pd
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn import metrics
from cryptography.fernet import Fernet

## Define SecureFrame Encryption Function

In [2]:
def SecureFrame(dataframe, ignore_cols=None, ohe_cols=None, augment_constant=False, encrypt_strings=False):

    '''Build an encrypted packet from a DataFrame and return it together with the Fernet key.

    Numeric columns are multiplied by one random factor between 0.7 and 1.3, the original column
    labels are replaced by positional string names ('0', '1', ...), and the factor and the original
    labels are Fernet-encrypted. The caller's DataFrame is left untouched; all changes are made on
    a copy.

    Parameters:
        dataframe: pandas DataFrame to protect.
        ignore_cols: positional string names (e.g. ['4']) of columns to leave unchanged. The names
            are compared against the renamed columns, not the original labels. Defaults to [''].
        ohe_cols: stored in the packet unchanged; it is not consulted when deciding which columns
            to scale (columns holding only 0/1 are detected automatically). Defaults to [''].
        augment_constant: when true, columns with a single unique value are scaled as well.
        encrypt_strings: when true, columns whose values cannot be converted to float are
            Fernet-encrypted value by value (each value is passed through str() first).

    Returns:
        (encrypted_packet, encryption_key) where encrypted_packet is a dict with the keys
        'encrypted_factor', 'encrypted_columns', 'augmented_dataframe', 'ignore_cols', 'ohe_cols'
        and 'augment_constant'.
    '''

    # Fresh lists per call: a list literal used as a default would be shared between calls.
    if ignore_cols is None:
        ignore_cols = ['']
    if ohe_cols is None:
        ohe_cols = ['']

    encryption_key = Fernet.generate_key() # Generate Fernet encryption key.
    cipher_suite = Fernet(encryption_key) # Initiate Fernet.

    # tolist() yields plain Python objects, so the string form of the list can be parsed back with
    # ast.literal_eval even when the labels are integers (NumPy scalars may not repr as literals).
    original_columns = dataframe.columns.tolist()
    factor = random.randint(7000, 13000) / 10000 # Generate random factor for encryption.

    encrypted_columns = cipher_suite.encrypt(str(original_columns).encode("utf-8")) # Encrypt DataFrame columns.
    encrypted_factor = cipher_suite.encrypt(str(factor).encode("utf-8")) # Encrypt randomly generated factor.

    # Work on a copy so the caller's DataFrame keeps its labels and values.
    augmented = dataframe.copy()
    augmented.columns = [str(position) for position in range(len(original_columns))]

    for col in augmented.columns:
        if col in ignore_cols:
            continue # Explicitly excluded column.

        series = augmented[col]
        try:
            if series.nunique() == 1:
                # NOTE(review): this check runs before the text check, so a constant text column
                # is only encrypted when augment_constant is also set.
                if not augment_constant:
                    continue # Constant column left as-is.
            elif series.isin([0, 1]).all():
                continue # Ignore OHE columns.

            augmented[col] = series.apply(lambda x: float(x) * factor) # Augment column.

        except (TypeError, ValueError):
            # float() rejected a value: treat the column as text.
            if encrypt_strings:
                augmented[col] = series.apply(lambda x: cipher_suite.encrypt(str(x).encode("utf-8"))) # Encrypt strings.

    encrypted_packet = {'encrypted_factor' : encrypted_factor,
                        'encrypted_columns' : encrypted_columns,
                        'augmented_dataframe' : augmented,
                        'ignore_cols' : ignore_cols,
                        'ohe_cols' : ohe_cols,
                        # Recorded so a reader of the packet can tell whether constant columns were scaled.
                        'augment_constant' : bool(augment_constant)}

    return encrypted_packet, encryption_key

## Define SecureFrame Decryption Function

In [3]:
def DecryptSecureFrame(encrypted_packet, encryption_key):

    '''Return a decrypted copy of the DataFrame held in an encrypted packet.

    The scaling factor and the original column labels are decrypted with the given key, scaled
    columns are divided by the factor, Fernet-encrypted columns are decrypted back to text, and
    the original labels are restored. The packet and its DataFrame are not modified, so the same
    packet can be decrypted more than once.

    Parameters:
        encrypted_packet: dict with the keys 'encrypted_factor', 'encrypted_columns',
            'augmented_dataframe' and 'ignore_cols'. An optional 'augment_constant' entry states
            whether single-valued columns were scaled; when absent they are assumed unscaled.
        encryption_key: Fernet key matching the packet.

    Returns:
        A new pandas DataFrame carrying the original column labels.
    '''

    # Local import: only this function needs the exception type.
    from cryptography.fernet import InvalidToken

    cipher_suite = Fernet(encryption_key)

    ignore_cols = encrypted_packet['ignore_cols']
    constants_scaled = encrypted_packet.get('augment_constant', False)

    # Parse the factor once instead of once per cell.
    factor = float(cipher_suite.decrypt(encrypted_packet['encrypted_factor']).decode("utf-8"))
    # literal_eval only accepts Python literals, so the decrypted text is parsed, never executed.
    column_names = ast.literal_eval(cipher_suite.decrypt(encrypted_packet['encrypted_columns']).decode("utf-8"))

    # Copy so that decrypting does not alter the packet's DataFrame.
    decrypted_dataframe = encrypted_packet['augmented_dataframe'].copy()

    for col in decrypted_dataframe.columns:
        if col in ignore_cols:
            continue # Column was never augmented.

        series = decrypted_dataframe[col]
        try:
            if series.nunique() == 1:
                if not constants_scaled:
                    continue # Constant column that was left unscaled.
            elif series.isin([0, 1]).all():
                continue # Ignore OHE columns.

            decrypted_dataframe[col] = series.apply(lambda x: float(x) / factor)

        except (TypeError, ValueError):
            # Not numeric: the column holds either Fernet tokens or plain text.
            try:
                decrypted_dataframe[col] = series.apply(lambda x: cipher_suite.decrypt(x).decode("utf-8"))
            except (TypeError, ValueError, InvalidToken):
                pass # Plain (unencrypted) text column: keep it as it is.

    decrypted_dataframe.columns = column_names

    return decrypted_dataframe

## Demonstrating Transferring Between DataFrames and SecureFrames

#### Create Dummy DataFrame

In [4]:
# Three sample products: numeric nutrition values, a text label and a 0/1 result.
dummy_columns = ['Fat', 'Protein', 'Fibre', 'Product', 'Result']
dummy_rows = [
    [43, 7, 3, 'Product 1', 0],
    [22, 5, 3, 'Product 2', 0],
    [27, 11, 4, 'Product 2', 1],
]
test_data = pd.DataFrame(dummy_rows, columns=dummy_columns)
print(test_data)

   Fat  Protein  Fibre    Product  Result
0   43        7      3  Product 1       0
1   22        5      3  Product 2       0
2   27       11      4  Product 2       1


#### Create SecureFrame Using DataFrame

In [5]:
# Encrypt the dummy frame; encrypt_strings=True also encrypts the text column.
secure_frame = SecureFrame(test_data, ignore_cols=[''], encrypt_strings=True)
secure_dataframe, encryption_key = secure_frame  # (encrypted packet, Fernet key)
print(secure_dataframe['augmented_dataframe'])

         0        1       2  \
0  53.7715   8.7535  3.7515   
1  27.5110   6.2525  3.7515   
2  33.7635  13.7555  5.0020   

                                                   3  4  
0  b'gAAAAABdsKxFCe15_pjtreX47uWjHD7E-qWgJxV1LeWR...  0  
1  b'gAAAAABdsKxFpOTrhkiEBFt4C2zSnbDIq7KRVYFTp8j7...  0  
2  b'gAAAAABdsKxFeZFEJRLlgTbdsx_CAU9qrRuSDSMBgDlE...  1  


In [6]:
# Reverse the packet with its key to recover the original labels and values.
decrypted_secure_frame = DecryptSecureFrame(
    encrypted_packet=secure_dataframe,
    encryption_key=encryption_key,
)
print(decrypted_secure_frame)

    Fat  Protein  Fibre    Product  Result
0  43.0      7.0    3.0  Product 1       0
1  22.0      5.0    3.0  Product 2       0
2  27.0     11.0    4.0  Product 2       1


## Machine Learning Using SecureFrames

#### Load Iris Dataset

In [7]:
# Load iris and place the four feature columns and the target side by side in one frame.
iris = load_iris()
iris_values = np.column_stack((iris['data'], iris['target']))
iris_labels = iris['feature_names'] + ['target']
iris_df = pd.DataFrame(data=iris_values, columns=iris_labels)
print(iris_df.head())

   sepal length (cm)  sepal width (cm)  petal length (cm)  petal width (cm)  \
0                5.1               3.5                1.4               0.2   
1                4.9               3.0                1.4               0.2   
2                4.7               3.2                1.3               0.2   
3                4.6               3.1                1.5               0.2   
4                5.0               3.6                1.4               0.2   

   target  
0     0.0  
1     0.0  
2     0.0  
3     0.0  
4     0.0  


#### Develop Random Forest Model Using Iris DataFrame

In [8]:
# Baseline: train and score a random forest on the unmodified iris data.
feature_columns = ['sepal length (cm)', 'sepal width (cm)', 'petal length (cm)', 'petal width (cm)']
X = iris_df[feature_columns]  # Iris features.
y = iris_df['target']  # Iris targets.

# Hold out 30% of the rows for testing; fixed seeds keep the run repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=5)

clf = RandomForestClassifier(n_estimators=100, random_state=0).fit(X_train, y_train)
y_pred = clf.predict(X_test)

accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9333333333333333


#### Create SecureFrame Using Iris DataFrame

In [121]:
# Rebuild the iris frame from scratch before encrypting it.
iris = load_iris()
iris_values = np.column_stack((iris['data'], iris['target']))
iris_labels = iris['feature_names'] + ['target']
iris_df = pd.DataFrame(data=iris_values, columns=iris_labels)

# ignore_cols=['4'] keeps the target (fifth column) unscaled; encrypt_strings is left at its default.
secure_frame = SecureFrame(iris_df, ignore_cols=['4'])
secure_dataframe, encryption_key = secure_frame  # (encrypted packet, Fernet key)

secure_iris_df = secure_dataframe['augmented_dataframe']
print(secure_iris_df.sample(n=5))

           0        1        2        3    4
29   5.58078  3.79968  1.89984  0.23748  0.0
17   6.05574  4.15590  1.66236  0.35622  0.0
45   5.69952  3.56220  1.66236  0.35622  0.0
104  7.71810  3.56220  6.88692  2.61228  2.0
7    5.93700  4.03716  1.78110  0.23748  0.0


In [122]:
# Same model and split as the baseline, trained on the scaled, relabelled frame.
secure_feature_columns = ['0', '1', '2', '3']
X = secure_iris_df[secure_feature_columns]  # Iris features.
y = secure_iris_df['4']  # Iris targets.

# Hold out 30% of the rows for testing; fixed seeds keep the run repeatable.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=5)

clf = RandomForestClassifier(n_estimators=100, random_state=0).fit(X_train, y_train)
y_pred = clf.predict(X_test)

accuracy = metrics.accuracy_score(y_test, y_pred)
print("Accuracy:", accuracy)

Accuracy: 0.9333333333333333


In [None]:
#add comments
#start zoe thing