### 01. Import Dependencies

In [21]:
import pandas as pd

### 02. Loading Data

In [22]:
# Read the processed transactions table (an earlier step's output, judging by the path)
# and preview the first five rows.
df = pd.read_csv('data/processed/handled_missing_values.csv')
df.head(5)

Unnamed: 0,user_id,signup_time,purchase_time,purchase_value,device_id,source,browser,sex,age,ip_address,class
0,22058,2015-02-24 22:55:49,2015-04-18 02:47:11,34,QVPSPJUOCKZAR,SEO,Chrome,M,39,732758400.0,0
1,333320,2015-06-07 20:39:50,2015-06-08 01:38:54,16,EOGFQPIZPYXFZ,Ads,Chrome,F,53,350311400.0,0
2,1359,2015-01-01 18:52:44,2015-01-01 18:52:45,15,YSSKYOSJHPPLJ,SEO,Opera,M,53,2621474000.0,1
3,150084,2015-04-28 21:13:25,2015-05-04 13:54:50,44,ATGTXKYKUDUQN,SEO,Safari,M,41,3840542000.0,0
4,221365,2015-07-21 07:09:52,2015-09-09 18:40:53,39,NAUITBZFJKHWW,Ads,Safari,M,45,415583100.0,0


In [23]:
# Read the IP-range lookup table (lower bound, upper bound, country) and preview it.
df_ip = pd.read_csv("data/raw/IpAddress_to_Country.csv")
df_ip.head(5)

Unnamed: 0,lower_bound_ip_address,upper_bound_ip_address,country
0,16777216.0,16777471,Australia
1,16777472.0,16777727,China
2,16777728.0,16778239,China
3,16778240.0,16779263,Australia
4,16779264.0,16781311,China


### 03. Feature Engineering

#### 3.1 account_age_minutes

In [24]:
# Parse both timestamp columns so they can be subtracted as datetimes.
df['signup_time'] = df['signup_time'].pipe(pd.to_datetime)
df['purchase_time'] = df['purchase_time'].pipe(pd.to_datetime)

# Time elapsed between signup and purchase, converted from seconds to minutes.
df['account_age_minutes'] = (
    df['purchase_time']
    .sub(df['signup_time'])
    .dt.total_seconds()
    .div(60)
)

#### 3.2 device_count

In [25]:
# For every row, the number of rows in the dataset that share its device_id.
df['device_count'] = df['device_id'].groupby(df['device_id']).transform('count')

#### 3.3 user_count_per_device

In [26]:
# For every row, the number of distinct user_id values seen on its device_id.
df['user_count_per_device'] = df['user_id'].groupby(df['device_id']).transform('nunique')

#### 3.4 map_ip_to_country

In [27]:
# Map IP addresses to countries based on IP ranges
def map_ip_to_country(ip_address, df_ip):
    """
    Look up the country whose IP range contains ``ip_address``.

    Parameters
    ----------
    ip_address : numeric
        The IP address in the same numeric form as the bounds in ``df_ip``.
    df_ip : pandas.DataFrame
        Lookup table with columns ``lower_bound_ip_address``,
        ``upper_bound_ip_address`` and ``country``.

    Returns
    -------
    The ``country`` value of the first row (in ``df_ip`` order) whose
    inclusive range [lower, upper] contains ``ip_address``, or ``None``
    when no row matches.
    """
    # Both bounds are inclusive.
    above_lower = df_ip['lower_bound_ip_address'].le(ip_address)
    below_upper = df_ip['upper_bound_ip_address'].ge(ip_address)
    hits = df_ip[above_lower & below_upper]

    if len(hits) == 0:
        return None

    # Overlapping ranges are not expected; if they occur, the first row wins.
    return hits['country'].iloc[0]

# Build the country column by looking up every row's ip_address in df_ip.
# NOTE(review): each lookup scans the whole of df_ip, so this is rows x ranges work;
# a sorted-range lookup (e.g. pd.merge_asof) may be faster if the ranges never overlap — confirm first.
df['country'] = df['ip_address'].apply(map_ip_to_country, args=(df_ip,))


### 4. Dropping Unwanted Columns

In [28]:
# Identifier, IP and raw timestamp columns to remove from the final feature table.
drop_cols = ['user_id', 'device_id', 'ip_address', 'signup_time', 'purchase_time']

# errors='ignore' skips any listed name that is not present instead of raising.
df_final = df.drop(drop_cols, axis=1, errors='ignore')
df_final.head(5)

Unnamed: 0,purchase_value,source,browser,sex,age,class,account_age_minutes,device_count,user_count_per_device,country
0,34,SEO,Chrome,M,39,0,75111.366667,1,1,Japan
1,16,Ads,Chrome,F,53,0,299.066667,1,1,United States
2,15,SEO,Opera,M,53,1,0.016667,12,12,United States
3,44,SEO,Safari,M,41,0,8201.416667,1,1,
4,39,Ads,Safari,M,45,0,72691.016667,1,1,United States


### 5. Remove Duplicates (after dropping identifying columns)

In [29]:
# Rows whose IP matched no range have a missing country; give them an explicit label.
# Read from df_final itself so this cell does not depend on df still being aligned.
df_final['country'] = df_final['country'].fillna('Unknown')

# Round account_age_minutes to avoid floating point precision issues
# (Two rows with 5.0000001 and 5.0000002 minutes should be considered duplicates)
df_final['account_age_minutes'] = df_final['account_age_minutes'].round(2)

# Ensure consistent data types for duplicate detection
df_final = df_final.astype({
    'purchase_value': int,
    'device_count': int,
    'user_count_per_device': int,
})

# Remove duplicates after dropping identifying columns
# (Different users/devices might have same feature values)
# Capture the count before deduplicating so the "removed" figure is always measured
# against the frame actually being deduplicated, not against df.
rows_before = len(df_final)
print(f"Rows before removing duplicates: {rows_before}")
df_final = df_final.drop_duplicates()
print(f"Rows after removing duplicates: {len(df_final)}")
print(f"Duplicates removed: {rows_before - len(df_final)}")

Rows before removing duplicates: 151112
Rows after removing duplicates: 144272
Duplicates removed: 6840


### 6. Saving the Dataset


In [30]:
# Write the final feature table to disk; index=False leaves the row index out of the CSV.
df_final.to_csv('data/processed/feature_engineered.csv', index=False)