In [5]:
from tensorflow.keras.preprocessing.text import one_hot
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Embedding, SimpleRNN, Dense
from tensorflow.keras.optimizers import Adam
from sklearn.model_selection import train_test_split
import pandas as pd
import numpy as np
import boto3
from io import StringIO

In [6]:
session = boto3.Session(profile_name='ai-dataset-bucket-readwrite-795524854110')

# Connect to S3
s3 = session.client('s3')

bucket_name = 'artificial-intelligence-datasets'
object_key = 'input/PhishingMails/Ling.csv'  # e.g., 'project1/data/sample.csv'

# Download object into memory
response = s3.get_object(Bucket=bucket_name, Key=object_key)

with response['Body'] as stream:
    body = stream.read().decode('utf-8')

data = pd.read_csv(StringIO(body))

data.head()

Unnamed: 0,subject,body,label
0,job posting - apple-iss research center,content - length : 3386 apple-iss research cen...,0
1,,"lang classification grimes , joseph e . and ba...",0
2,query : letter frequencies for text identifica...,i am posting this inquiry for sergei atamas ( ...,0
3,risk,a colleague and i are researching the differin...,0
4,request book information,earlier this morning i was on the phone with a...,0


In [7]:
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2859 entries, 0 to 2858
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   subject  2797 non-null   object
 1   body     2859 non-null   object
 2   label    2859 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 67.1+ KB


In [8]:
data['subject'].fillna('', inplace=True)
data.info()

<class 'pandas.core.frame.DataFrame'>
RangeIndex: 2859 entries, 0 to 2858
Data columns (total 3 columns):
 #   Column   Non-Null Count  Dtype 
---  ------   --------------  ----- 
 0   subject  2859 non-null   object
 1   body     2859 non-null   object
 2   label    2859 non-null   int64 
dtypes: int64(1), object(2)
memory usage: 67.1+ KB


The behavior will change in pandas 3.0. This inplace method will never work because the intermediate object on which we are setting values always behaves as a copy.

For example, when doing 'df[col].method(value, inplace=True)', try using 'df.method({col: value}, inplace=True)' or df[col] = df[col].method(value) instead, to perform the operation inplace on the original object.


  data['subject'].fillna('', inplace=True)


In [9]:
data.head()

Unnamed: 0,subject,body,label
0,job posting - apple-iss research center,content - length : 3386 apple-iss research cen...,0
1,,"lang classification grimes , joseph e . and ba...",0
2,query : letter frequencies for text identifica...,i am posting this inquiry for sergei atamas ( ...,0
3,risk,a colleague and i are researching the differin...,0
4,request book information,earlier this morning i was on the phone with a...,0


### Demonstrating how one-hot representation can be obtained using the one_hot module

In [10]:
sentences = data.iloc[:9,0].values
sentences

array(['job posting - apple-iss research center', '',
       'query : letter frequencies for text identification', 'risk',
       'request book information',
       'call for abstracts : optimality in syntactic theory',
       'm . a . in scandinavian linguistics',
       'call for papers : linguistics session of the m / mla',
       'foreign language in commercials'], dtype=object)

In [11]:
vocab_size = 10000
one_hot_repr = [one_hot(sentence, vocab_size) for sentence in sentences]
one_hot_repr

[[6214, 3270, 4340, 7572, 9852, 124],
 [],
 [3530, 786, 5774, 9078, 7454, 9388],
 [6940],
 [1149, 6438, 9876],
 [3432, 9078, 7834, 7156, 2720, 8857, 6352],
 [2799, 404, 2720, 4155, 3518],
 [3432, 9078, 4341, 3518, 7098, 2449, 8482, 2799, 6421],
 [4066, 6117, 2720, 8644]]

### One Hot Representation of the entire data

In [12]:
data['subject'].values


array(['job posting - apple-iss research center', '',
       'query : letter frequencies for text identification', ...,
       "anglicization of composers ' names",
       're : 6 . 797 , comparative method : n - ary comparison',
       're : american - english in australia'], dtype=object)

### Train Test Split

In [13]:
X = data.loc[:, data.columns != 'label']
y = data['label']

In [14]:
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)
X_train.shape, X_test.shape, y_train.shape, y_test.shape

((2287, 2), (572, 2), (2287,), (572,))