# EDA_2: DataFrame exploration, cleaning and splitting

In [None]:
# Convert the timestamps and export the completed dataset under a "_v2" file name
from datetime import datetime, timezone

import numpy as np
import pandas as pd

# Source export and the destination of the converted "_v2" file
input_path = "C:/Users/user/Desktop/....../teleprompter_hashed_fixed.csv"
output_path = "C:/Users/user/Desktop/....../teleprompter_hashed_fixed_v2.csv"

# 1. Load the ';'-separated export with an explicit dtype per column
print("Data retrieve...")
dtypes = dict(
    event='category',
    distinct_id='string',
    os_version='category',
    country_code='category',
    event_time='int64',
)
telprom = pd.read_csv(input_path, sep=";", dtype=dtypes)

# 2. Timestamp conversion helper follows
print("Dateconversion...")
def convert_timestamp(ts):
    """Convert a POSIX timestamp (seconds) into UTC date/time strings.

    Args:
        ts: Seconds since the Unix epoch, interpreted as UTC.

    Returns:
        A 3-tuple ``(date, time, datetime)`` formatted as ``"%Y.%m.%d"``,
        ``"%H:%M:%S"`` and ``"%Y.%m.%d. %H:%M:%S"``. If ``ts`` cannot be
        converted (wrong type, NaN, out of range), a tuple of three
        ``np.nan`` values is returned instead.
    """
    try:
        # Timezone-aware replacement for the deprecated datetime.utcfromtimestamp().
        dt = datetime.fromtimestamp(ts, tz=timezone.utc)
    except (TypeError, ValueError, OverflowError, OSError):
        # Only conversion failures are mapped to NaN; anything else
        # (e.g. KeyboardInterrupt) is allowed to propagate.
        return np.nan, np.nan, np.nan
    return dt.strftime("%Y.%m.%d"), dt.strftime("%H:%M:%S"), dt.strftime("%Y.%m.%d. %H:%M:%S")

# 3. Convert the timestamps batch by batch
chunk_size = 1_000_000
results = []
total_rows = len(telprom)

# Slice by position instead of np.array_split: array_split on a DataFrame relies on
# the deprecated DataFrame.swapaxes and does not produce chunk_size-sized pieces,
# so the progress counter below would not match the rows actually processed.
# max(..., 1) keeps a single (empty) batch for an empty frame so `results` is never empty.
for start in range(0, max(total_rows, 1), chunk_size):
    print(f"Processing: {start}/{total_rows} line")

    # Work on an explicit copy so the column assignments below do not write into
    # a slice of `telprom` (avoids SettingWithCopyWarning).
    chunk = telprom.iloc[start:start + chunk_size].copy()

    # Each element is a (date, time, datetime) tuple; transpose into three columns.
    converted = chunk['event_time'].apply(convert_timestamp)
    if len(converted):
        dates, times, datetimes = zip(*converted)
    else:
        # zip(*[]) yields nothing to unpack, so handle the empty batch explicitly.
        dates, times, datetimes = (), (), ()

    chunk['event_date'] = list(dates)
    chunk['event_time'] = list(times)  # replaces the raw epoch value with the time string
    chunk['event_datetime'] = list(datetimes)

    # Rearrange columns
    chunk = chunk[['event', 'event_date', 'event_time', 'distinct_id', 'os_version', 'country_code', 'event_datetime']]

    results.append(chunk)

# 4. Stitch the processed batches together and export them
print("Result save...")
telprom_final = pd.concat(results)

# Drop the references to the inputs, then run a garbage-collection pass
del telprom
del results
import gc
gc.collect()

# Write the combined frame as a ';'-separated CSV without the index column
telprom_final.to_csv(output_path, sep=";", index=False)

print("Done! Saved hier:", output_path)
print("Rearrange columns:", list(telprom_final.columns))
print("Samples:")
print(telprom_final.head())

In [2]:
# Display the first rows of the converted frame (head() returns 5 rows by default).
telprom_final.head()

Unnamed: 0,event,event_date,event_time,distinct_id,os_version,country_code,event_datetime
0,app_close,2025.02.16,02:59:49,$device:user_03108709407604,unknown,unknown,2025.02.16. 02:59:49
1,app_open,2025.02.16,02:59:51,$device:user_03108709407604,unknown,unknown,2025.02.16. 02:59:51
2,PlayerViewController.startStopScrollingAction(...,2025.02.16,03:00:06,$device:user_03108709407604,unknown,unknown,2025.02.16. 03:00:06
3,PlayerViewController.startStopScrollingAction(...,2025.02.16,03:00:07,$device:user_03108709407604,unknown,unknown,2025.02.16. 03:00:07
4,PlayerViewController.startStopScrollingAction(...,2025.02.16,03:00:07,$device:user_03108709407604,unknown,unknown,2025.02.16. 03:00:07


## Extracting the 10 key event types from the full database

In [None]:
# Reads the "_v2" CSV into a DataFrame, filters it for the 10 important event types, and saves each
# filtered subset as its own CSV file in the new folder.

import pandas as pd
import numpy as np
import os

input_file_path = "C:/Users/user/Desktop/......./teleprompter_hashed_fixed_v2.csv"

# Pin the text columns to string: without an explicit dtype, read_csv infers types
# (per internal chunk when low_memory is on), so a version such as "18.10" can be
# parsed as the float 18.1 and written back changed.
telprom = pd.read_csv(
    input_file_path,
    sep=";",
    dtype={"distinct_id": "string", "os_version": "string", "country_code": "string"},
)

# The 10 event types that are exported, one CSV file per event
events_to_keep = [
    "billing_issue_event", "cancellation_event", "expiration_event",
    "initial_purchase_event", "product_change_event", "renewal_event",
    "trial_cancelled_event", "trial_converted_event", "trial_started_event",
    "uncancellation_event"
]

output_directory = "C:/Users/user/Desktop/......./PY_10_PYClean_Source_V5/"

# Ensure that the folder exists
os.makedirs(output_directory, exist_ok=True)

# Filter by event and save each subset to "<event>.csv" (comma-separated, no index)
for event in events_to_keep:
    df_filtered = telprom[telprom["event"] == event]
    output_path = os.path.join(output_directory, f"{event}.csv")
    df_filtered.to_csv(output_path, index=False)
    print(f"Saved: {output_path}")

print("✅ All files are complete and saved!")

## Grouping the dataset's distinct_id values into 3 types based on their unique prefixes

In [None]:
# Reads the datasets from the V5 folder and sorts the distinct_id values into 3 types based on their
# prefixes. The completed datasets are then written to a V6 folder.
import os
import pandas as pd
import numpy as np
from pathlib import Path

# Folder holding the per-event CSV files, and the folder receiving the processed copies
source_dir = r"C:\Users\user\Desktop\.....\PY_10_PYClean_Source_V5"
output_dir = r"C:\Users\user\Desktop\.....\PY_10_PYClean_Source_V6"

# Make sure the output folder (and any missing parents) is there; an existing folder is fine
os.makedirs(output_dir, exist_ok=True)

def process_distinct_id(df):
    """Strip the known prefix from ``distinct_id`` and record which prefix it had.

    The frame is modified in place and also returned.

    Args:
        df: DataFrame with a string-valued ``distinct_id`` column.

    Returns:
        The same DataFrame, with the prefix removed from ``distinct_id`` and a
        new ``distinct_type`` column holding ``'A'`` (``user_``), ``'B'``
        (``$device:user_``) or ``'C'`` (``$RCAnonymousID:user_``). Rows that
        match none of the prefixes keep their ``distinct_id`` and get NaN as
        ``distinct_type``.
    """
    prefix_types = (
        ('user_', 'A'),
        ('$device:user_', 'B'),
        ('$RCAnonymousID:user_', 'C'),
    )

    # Evaluate every mask against the untouched ids, so stripping one prefix
    # can never influence the match of another.
    ids = df['distinct_id']
    masks = [
        (prefix, label, ids.str.startswith(prefix, na=False))
        for prefix, label in prefix_types
    ]

    # Object dtype from the start: a float NaN column would have to be upcast when
    # the string labels are assigned, which newer pandas versions no longer allow silently.
    df['distinct_type'] = np.full(len(df), np.nan, dtype=object)

    for prefix, label, mask in masks:
        # Slice length is derived from the prefix itself instead of a hard-coded number.
        df.loc[mask, 'distinct_id'] = df.loc[mask, 'distinct_id'].str[len(prefix):]
        df.loc[mask, 'distinct_type'] = label

    return df

# Process all CSV files in the source directory
for filename in os.listdir(source_dir):
    if not filename.endswith('.csv'):
        continue

    print(f"Processing {filename}...")

    # Read the text columns as string: with type inference, a file whose os_version
    # values all look numeric would be parsed as float (e.g. "18.10" -> 18.1) and
    # written back changed.
    filepath = os.path.join(source_dir, filename)
    df = pd.read_csv(
        filepath,
        dtype={'distinct_id': 'string', 'os_version': 'string', 'country_code': 'string'},
    )

    # Strip the id prefixes and add the distinct_type column
    df = process_distinct_id(df)

    # Save under the same file name in the output directory
    output_path = os.path.join(output_dir, filename)
    df.to_csv(output_path, index=False)

    print(f"Saved processed file to {output_path}")

print("All files processed successfully!")

In [7]:
import pandas as pd

input_file_path = "C:/Users/user/Desktop/...../PY_10_PYClean_Source_V6/trial_converted_event.csv"

# After prefix stripping, distinct_id consists of digits only; with default type
# inference it would be parsed as an integer and ids such as "03108709407604"
# would lose their leading zero. os_version is kept as text for the same reason
# ("18.10" must not become 18.1).
telprom = pd.read_csv(
    input_file_path,
    sep=",",
    dtype={"distinct_id": "string", "os_version": "string"},
)

In [8]:
# Display the first rows of the loaded V6 file (head() returns 5 rows by default).
telprom.head()

Unnamed: 0,event,event_date,event_time,distinct_id,os_version,country_code,event_datetime,distinct_type
0,trial_converted_event,2025.01.08,02:08:07,28590599158315,18.2,unknown,2025.01.08. 02:08:07,A
1,trial_converted_event,2025.01.07,14:26:19,33168113216396,18.1.1,unknown,2025.01.07. 14:26:19,B
2,trial_converted_event,2025.02.26,06:23:29,38080551652614,18.2.1,unknown,2025.02.26. 06:23:29,B
3,trial_converted_event,2025.02.23,14:40:21,68665611857065,18.1.1,unknown,2025.02.23. 14:40:21,C
4,trial_converted_event,2025.02.18,00:31:07,88738418324176,18.1.1,unknown,2025.02.18. 00:31:07,B
