In [2]:
import pandas as pd
import numpy as np
import pyarrow.parquet as pq
from memory_profiler import profile
import matplotlib.pyplot as plt
import seaborn as sns
import os
import warnings

In [3]:
warnings.filterwarnings("ignore")
@profile
def load_data_with_memory_optimization(file_path):
    """
    内存优化的数据加载函数
    使用PyArrow的Parquet读取器进行分块处理
    """
    # 首先只读取元数据获取列信息
    parquet_file = pq.ParquetFile(file_path)
    
    # 获取列的数据类型
    schema = parquet_file.schema.to_arrow_schema()
    column_types = {name: str(type_) for name, type_ in zip(schema.names, schema.types)}
    
    # 定义优化的数据类型
    dtype_mapping = {}
    for col, col_type in column_types.items():
        if 'int' in col_type:
            dtype_mapping[col] = 'int32'
        elif 'float' in col_type:
            dtype_mapping[col] = 'float32'
        elif 'bool' in col_type:
            dtype_mapping[col] = 'bool'
        else:
            # 对于对象类型，尝试转换为category如果基数低
            dtype_mapping[col] = 'category'
    
    # 分块读取数据
    chunk_size = 100000  # 根据内存调整
    chunks = []
    for batch in parquet_file.iter_batches(batch_size=chunk_size):
        df_chunk = batch.to_pandas()
        
        # 优化数据类型
        for col, dtype in dtype_mapping.items():
            if col in df_chunk.columns:
                try:
                    if dtype == 'category':
                        # 仅当唯一值较少时才转换为category
                        if len(df_chunk[col].unique()) / len(df_chunk) < 0.1:
                            df_chunk[col] = df_chunk[col].astype('category')
                    else:
                        df_chunk[col] = df_chunk[col].astype(dtype)
                except (ValueError, TypeError):
                    pass
        
        chunks.append(df_chunk)
    
    # 合并分块
    df = pd.concat(chunks, ignore_index=True)
    
    return df

# 使用示例
train_path = 'E:/GIT PROJECT/FR/kaggle/input/data/aeroclub-recsys-2025/processed_train.parquet'
train_df = load_data_with_memory_optimization(train_path)

# 显示基本信息
# print(f"数据集维度: {train_df.shape}")
# print("\n前5行数据:")
# print(train_df.head())
# print("\n数据类型:")
# print(train_df.dtypes)
# print("\n内存使用:")
# print(train_df.memory_usage(deep=True).sum() / (1024**2), "MB")

ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_158484\1171195580.py


In [None]:
@profile
def analyze_group_distribution(df, group_col='ranker_id'):
    """
    分析组大小分布，内存优化版本
    """
    # 使用groupby的size()方法，这是内存效率最高的方式
    group_sizes = df.groupby(group_col).size()
    
    # 转换为更节省内存的Series
    group_sizes = group_sizes.astype('int32')
    
    # 计算统计信息
    stats = {
        'total_groups': len(group_sizes),
        'mean_size': group_sizes.mean(),
        'median_size': group_sizes.median(),
        'min_size': group_sizes.min(),
        'max_size': group_sizes.max(),
        'std_size': group_sizes.std(),
        'percentiles': group_sizes.quantile([0.25, 0.5, 0.75, 0.9, 0.95, 0.99]).to_dict()
    }
    
    # 可视化组大小分布（对数尺度）
    import matplotlib.pyplot as plt
    plt.figure(figsize=(10, 6))
    plt.hist(group_sizes, bins=50, log=True)
    plt.title(f'Distribution of {group_col} group sizes (log scale)')
    plt.xlabel('Group size')
    plt.ylabel('Frequency (log)')
    plt.grid(True)
    plt.savefig('group_size_distribution.png')
    plt.close()
    
    return stats, group_sizes

group_stats, group_sizes = analyze_group_distribution(train_df)
print("\n组大小统计信息:")
for k, v in group_stats.items():
    print(f"{k}: {v}")

In [None]:
@profile
def analyze_user_segments(df):
    """
    分析用户细分市场模式，内存优化版本
    """
    # 选择关键用户特征进行分析
    user_cols = ['profileId', 'companyID', 'sex', 'nationality', 
                'frequentFlyer', 'isVip', 'bySelf', 'isAccess3D']
    
    # 确保这些列存在
    user_cols = [col for col in user_cols if col in df.columns]
    
    # 创建用户特征DataFrame，删除重复项
    user_features = df[user_cols].drop_duplicates(subset=['profileId'])
    
    # 分析公司分布
    company_dist = user_features['companyID'].value_counts(normalize=True)
    
    # 分析用户人口统计
    demo_stats = {}
    for col in ['sex', 'nationality', 'frequentFlyer', 'isVip', 'bySelf', 'isAccess3D']:
        if col in user_features.columns:
            demo_stats[col] = user_features[col].value_counts(normalize=True).to_dict()
    
    # 分析公司政策影响
    if 'corporateTariffCode' in df.columns:
        policy_impact = df.groupby(['companyID', 'corporateTariffCode']).size().unstack(fill_value=0)
    else:
        policy_impact = None
    
    return {
        'company_distribution': company_dist.to_dict(),
        'demographic_stats': demo_stats,
        'policy_impact': policy_impact
    }

# 使用示例
user_segment_stats = analyze_user_segments(train_df)
print("\n用户细分市场分析:")
print("公司分布:", user_segment_stats['company_distribution'])
print("\n人口统计:")
for k, v in user_segment_stats['demographic_stats'].items():
    print(f"{k}: {v}")

In [None]:
@profile
def analyze_flight_features(df, sample_size=100000):
    """
    分析航班特征分布，使用采样减少内存使用
    """
    # 对大型数据集进行采样
    if len(df) > sample_size:
        df_sample = df.sample(n=sample_size, random_state=42)
    else:
        df_sample = df.copy()
    
    # 选择关键航班特征
    flight_cols = ['totalPrice', 'taxes', 'legs0_duration', 'legs1_duration',
                  'legs0_segments0_cabinClass', 'legs1_segments0_cabinClass',
                  'miniRules0_statusInfos', 'miniRules1_statusInfos',
                  'pricingInfo_isAccessTP']
    
    # 确保这些列存在
    flight_cols = [col for col in flight_cols if col in df_sample.columns]
    
    # 分析数值特征
    num_features = ['totalPrice', 'taxes', 'legs0_duration', 'legs1_duration']
    num_features = [col for col in num_features if col in flight_cols]
    num_stats = df_sample[num_features].describe().to_dict()
    
    # 分析分类特征
    cat_features = [col for col in flight_cols if col not in num_features]
    cat_stats = {}
    for col in cat_features:
        cat_stats[col] = df_sample[col].value_counts(normalize=True).to_dict()
    
    # 可视化关键特征
    import matplotlib.pyplot as plt
    for col in num_features:
        plt.figure(figsize=(10, 6))
        plt.hist(df_sample[col].dropna(), bins=50)
        plt.title(f'Distribution of {col}')
        plt.xlabel(col)
        plt.ylabel('Frequency')
        plt.grid(True)
        plt.savefig(f'{col}_distribution.png')
        plt.close()
    
    return {
        'numerical_stats': num_stats,
        'categorical_stats': cat_stats
    }

# 使用示例
flight_feature_stats = analyze_flight_features(train_df)
print("\n航班特征分析:")
print("数值特征统计:")
for k, v in flight_feature_stats['numerical_stats'].items():
    print(f"{k}: {v}")
print("\n分类特征统计:")
for k, v in flight_feature_stats['categorical_stats'].items():
    print(f"{k}: {v}")

In [None]:
import pyarrow.parquet as pq
import pandas as pd

def analyze_null_columns(parquet_path, threshold=99):
    """
    分析Parquet文件中空值比例过高的特征
    
    参数:
        parquet_path: Parquet文件路径
        threshold: 空值比例阈值(百分比)，默认99%
    
    返回:
        DataFrame包含空值比例超过阈值的特征及其统计信息
    """
    # 1. 打开Parquet文件
    parquet_file = pq.ParquetFile(parquet_path)
    
    # 2. 获取列名和初始化计数器
    schema = parquet_file.schema_arrow  # 使用schema_arrow替代schema
    columns = schema.names
    null_counts = {col: 0 for col in columns}
    total_rows = parquet_file.metadata.num_rows
    
    print(f"开始分析文件: {parquet_path}")
    print(f"总行数: {total_rows:,}")
    print(f"总特征数: {len(columns)}")
    
    # 3. 分批次读取
    batch_size = 5000  # 根据内存调整
    processed_rows = 0
    
    for batch in parquet_file.iter_batches(batch_size=batch_size):
        df_batch = batch.to_pandas()
        processed_rows += len(df_batch)
        
        # 更新进度信息
        progress = (processed_rows / total_rows) * 100
        print(f"\r处理进度: {progress:.1f}% ({processed_rows:,}/{total_rows:,}行)", end="", flush=True)
        
        for col in columns:
            if col in df_batch.columns:
                null_counts[col] += df_batch[col].isnull().sum()
    
    print("\n分析完成!")
    
    # 4. 计算空值比例
    null_percent = {col: (null_counts[col] / total_rows) * 100 for col in columns}
    
    # 5. 创建结果DataFrame
    result_df = pd.DataFrame({
        'feature': columns,
        'null_count': [null_counts[col] for col in columns],
        'null_percent': [null_percent[col] for col in columns],
        'dtype': [str(schema.field(col).type) for col in columns]  # 使用schema.field
    })
    
    # 6. 筛选并排序高比例空值特征
    high_null_df = result_df[result_df['null_percent'] > threshold]\
                     .sort_values('null_percent', ascending=False)
    
    return high_null_df

# 使用示例
try:
    high_null_features = analyze_null_columns(
        'E:/GIT PROJECT/FR/kaggle/input/data/aeroclub-recsys-2025/train.parquet',
        threshold=-1  # 可以调整此阈值
    )

    print(len(high_null_features))
    # 打印结果
    if not high_null_features.empty:
        print("\n空值比例超过阈值的特征:")
        print(high_null_features[['feature', 'null_percent', 'dtype']].to_string(index=False))
        
        # 保存结果到CSV
        high_null_features.to_csv('high_null_features.csv', index=False)
        print("\n结果已保存到 high_null_features.csv")
    else:
        print("\n没有发现空值比例超过阈值的特征")
        
except Exception as e:
    print(f"发生错误: {str(e)}")

In [13]:
@profile
def validate_data_consistency(df, sample_size=500000):
    """
    验证数据一致性规则，使用采样减少内存使用
    """
    # 对大型数据集进行采样
    if len(df) > sample_size:
        df_sample = df.sample(n=sample_size, random_state=42)
    else:
        df_sample = df.copy()
    
    # 检查1: 每个ranker_id组只有一个selected=1的记录或全部为0
    selected_counts = df_sample.groupby('ranker_id')['selected'].sum()
    # 修正条件判断：每个组的selected总和要么等于1，要么等于0
    invalid_groups = selected_counts[(selected_counts != 1) & (selected_counts != 0)]
    selected_check = len(invalid_groups) == 0
    
    if 'totalPrice' in df_sample.columns and 'taxes' in df_sample.columns:
        price_check = (df_sample['totalPrice'] >= df_sample['taxes']).all()
    else:
        price_check = "Columns not available"
    
    # 检查3: 出发时间早于到达时间（改进部分）
    time_checks = {}
    time_format_errors = {}
    
    def safe_convert_to_datetime(series):
        """安全转换时间格式，记录错误"""
        try:
            return pd.to_datetime(series, errors='coerce')
        except Exception as e:
            print(f"时间转换错误: {e}")
            return pd.Series([pd.NaT] * len(series), False)
    
    # 处理leg0的时间比较
    if 'legs0_departureAt' in df_sample.columns and 'legs0_arrivalAt' in df_sample.columns:
        # 转换时间格式
        # leg0_departure = safe_convert_to_datetime(df_sample['legs0_departureAt'])
        # leg0_arrival = safe_convert_to_datetime(df_sample['legs0_arrivalAt'])
        leg0_departure = df_sample['legs0_departureAt']
        leg0_arrival = df_sample['legs0_arrivalAt']
        # 检查转换是否成功
        if leg0_departure is not None and leg0_arrival is not None:
            # 计算有效时间记录
            valid_mask = (~leg0_departure.isna()) & (~leg0_arrival.isna())
            valid_count = valid_mask.sum()
            invalid_count = len(df_sample) - valid_count
            
            # 只比较有效时间记录
            if valid_count > 0:
                time_checks['leg0'] = (leg0_arrival[valid_mask] > leg0_departure[valid_mask]).all()
                time_format_errors['leg0_invalid_timestamps'] = invalid_count
            else:
                time_checks['leg0'] = "No valid timestamps"
        else:
            time_checks['leg0'] = "Time conversion failed"
    
    # 处理leg1的时间比较
    if 'legs1_departureAt' in df_sample.columns and 'legs1_arrivalAt' in df_sample.columns:
        # 转换时间格式
        # leg1_departure = safe_convert_to_datetime(df_sample['legs1_departureAt'])
        # leg1_arrival = safe_convert_to_datetime(df_sample['legs1_arrivalAt'])
        leg1_departure = df_sample['legs1_departureAt']
        leg1_arrival = df_sample['legs1_arrivalAt']
        # 检查转换是否成功
        if leg1_departure is not None and leg1_arrival is not None:
            # 计算有效时间记录
            valid_mask = (~leg1_departure.isna()) & (~leg1_arrival.isna())
            valid_count = valid_mask.sum()
            invalid_count = len(df_sample) - valid_count
            
            # 只比较有效时间记录
            if valid_count > 0:
                time_checks['leg1'] = (leg1_arrival[valid_mask] > leg1_departure[valid_mask]).all()
                time_format_errors['leg1_invalid_timestamps'] = invalid_count
            else:
                time_checks['leg1'] = "No valid timestamps"
        else:
            time_checks['leg1'] = "Time conversion failed"
    
    # 检查4: 舱位等级是否有效
    if 'legs0_segments0_cabinClass' in df_sample.columns:
        cabin_classes = df_sample['legs0_segments0_cabinClass'].dropna().unique()
        valid_classes = {1.0, 2.0, 4.0}
        cabin_check = all(x in valid_classes for x in cabin_classes)
    else:
        cabin_check = "Column not available"
    
    return {
        'single_selected_per_group': selected_check,
        'price_greater_than_taxes': price_check,
        'departure_before_arrival': time_checks,
        'invalid_count':invalid_count,
        'time_format_errors': time_format_errors,
        'valid_cabin_classes': cabin_check,
        'invalid_groups': len(invalid_groups)
    }

# 使用示例
consistency_checks = validate_data_consistency(train_df)
print("\n改进版数据一致性检查:")
for k, v in consistency_checks.items():
    print(f"{k}: {v}")

ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_158484\776362505.py

改进版数据一致性检查:
single_selected_per_group: False
price_greater_than_taxes: Columns not available
departure_before_arrival: {'leg0': np.False_, 'leg1': np.False_}
invalid_count: 0
time_format_errors: {'leg0_invalid_timestamps': np.int64(0), 'leg1_invalid_timestamps': np.int64(0)}
valid_cabin_classes: False
invalid_groups: 75


In [2]:
import pandas as pd
import pyarrow.parquet as pq
from pathlib import Path

def validate_flight_times_large(input_path, output_path, chunksize=100000):
    """
    Memory-efficient validation of flight times for large Parquet files.
    
    Args:
        input_path (str): Path to input Parquet file
        output_path (str): Path to output CSV file for invalid rows
        chunksize (int): Number of rows to process at a time
    """
    # Initialize variables
    invalid_rows = []
    first_chunk = True
    output_file = Path(output_path)
    
    # Create Parquet file iterator
    parquet_file = pq.ParquetFile(input_path)
    
    # Process file in chunks
    for batch in parquet_file.iter_batches(batch_size=chunksize):
        df = batch.to_pandas()
        
        # Convert timestamp columns
        timestamp_cols = ['legs0_arrivalAt', 'legs0_departureAt', 
                         'legs1_arrivalAt', 'legs1_departureAt']
        for col in timestamp_cols:
            if col in df.columns:
                try:
                    df[col] = pd.to_datetime(df[col], unit='s')
                except:
                    df[col] = pd.to_datetime(df[col], unit='ms')
        
        # Find invalid rows
        mask = ((df['legs0_arrivalAt'] <= df['legs0_departureAt']) | 
               (df['legs1_arrivalAt'] <= df['legs1_departureAt']))
        chunk_invalid = df[mask]
        
        # Append to results
        if not chunk_invalid.empty:
            if first_chunk:
                chunk_invalid.to_csv(output_file, index=False)
                first_chunk = False
            else:
                chunk_invalid.to_csv(output_file, mode='a', header=False, index=False)
            invalid_rows.append(len(chunk_invalid))
    
    # Print summary
    total_invalid = sum(invalid_rows)
    if total_invalid > 0:
        print(f"Found {total_invalid} invalid rows. Saved to {output_path}")
    else:
        print("All rows have valid arrival/departure times.")
        if output_file.exists():
            output_file.unlink()  # Remove empty file

# Example usage:
input_file = "E:/GIT PROJECT/FR/kaggle/input/data/aeroclub-recsys-2025/processed_train.parquet"
output_file = "E:/GIT PROJECT/FR/invalid_flight_times.csv"
validate_flight_times_large(input_file, output_file, chunksize=100000)

Found 4407328 invalid rows. Saved to E:/GIT PROJECT/FR/invalid_flight_times.csv


In [1]:
import pandas as pd
import numpy as np
from memory_profiler import profile
import pyarrow.parquet as pq

@profile
def validate_airport_city_consistency(df, sample_size=None):
    """
    验证机场城市代码与机场代码是否一一对应（内存优化版本）
    
    参数:
        df: 输入的DataFrame
        sample_size: 采样大小（None表示使用完整数据）
    
    返回:
        dict: 包含验证结果的字典
    """
    # 选择需要的列，减少内存使用
    cols = [
        'profileId',
        'ranker_id'
#  'legs0_segments0_aircraft_code',
#  'legs0_segments0_arrivalTo_airport_city_iata',
#  'legs0_segments0_arrivalTo_airport_iata',
#  'legs0_segments0_departureFrom_airport_iata',
# 'legs0_segments0_flightNumber',
# 'legs0_segments0_operatingCarrier_code',
# 'legs0_segments1_aircraft_code',
# 'legs0_segments1_arrivalTo_airport_city_iata',
#'legs0_segments1_arrivalTo_airport_iata',
# 'legs1_segments0_aircraft_code',
#   'legs1_segments0_arrivalTo_airport_city_iata',
# 'legs1_segments0_arrivalTo_airport_iata',
#   'legs1_segments0_departureFrom_airport_iata',
        
            ]
    
    # 检查列是否存在
    cols = [col for col in cols if col in df.columns]
    if len(cols) != 2:
        return {"error": "Required columns not found in DataFrame"}
    
    # 如果指定了采样大小，则进行采样
    if sample_size and len(df) > sample_size:
        df_sample = df[cols].sample(n=sample_size, random_state=42)
    else:
        df_sample = df[cols].copy()
    
    # 删除缺失值
    df_sample = df_sample.dropna(subset=cols)
    
    # 转换为分类数据类型以减少内存
    df_sample[cols[0]] = df_sample[cols[0]].astype('category')
    df_sample[cols[1]] = df_sample[cols[1]].astype('category')
    
    # 方法1: 使用groupby验证一一对应关系
    # 计算每个城市代码对应的机场代码数量
    city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()
    
    # 找出有多个机场代码的城市
    non_unique_cities = city_to_airport_counts[city_to_airport_counts > 1]
    
    # 方法2: 构建映射字典验证（备选方法）
    mapping_dict = {}
    inconsistent_pairs = []
    
    # 使用迭代器减少内存使用
    for city, airport in df_sample.itertuples(index=False, name=None):
        if city in mapping_dict:
            if mapping_dict[city] != airport:
                inconsistent_pairs.append((city, mapping_dict[city], airport))
        else:
            mapping_dict[city] = airport
    
    # 准备结果
    result = {
        "total_unique_cities": len(city_to_airport_counts),
        "total_unique_airports": df_sample[cols[1]].nunique(),
        "cities_with_multiple_airports": len(non_unique_cities),
        "is_one_to_one": len(non_unique_cities) == 0,
        "inconsistent_pairs_count": len(inconsistent_pairs),
        "sample_inconsistent_pairs": inconsistent_pairs[:10] if inconsistent_pairs else None,
        "sample_non_unique_cities": non_unique_cities.head(10).to_dict() if len(non_unique_cities) > 0 else None
    }
    
    # 如果数据量很大，添加内存使用信息
    if sample_size:
        result["sample_size_used"] = sample_size
    else:
        result["sample_size_used"] = "full dataset"
    
    return result

# 使用示例
if __name__ == "__main__":
    # 示例数据加载 - 使用PyArrow的分批读取功能
    parquet_file = pq.ParquetFile('E:/GIT PROJECT/FR/kaggle/input/data/aeroclub-recsys-2025/train.parquet')
    
    # 初始化结果容器
    final_result = {
        "total_batches_processed": 0,
        "global_is_one_to_one": True,
        "all_inconsistent_pairs": set(),
        "all_non_unique_cities": set()
    }
    
    # 分块处理大数据
    batch_size = 100000  # 每批处理的行数
    for i, batch in enumerate(parquet_file.iter_batches(batch_size=batch_size)):
        # 转换为Pandas DataFrame
        df_chunk = batch.to_pandas()
        
        # 验证当前块
        chunk_result = validate_airport_city_consistency(df_chunk)
        
        # 更新全局结果
        final_result["total_batches_processed"] += 1
        
        if not chunk_result.get('is_one_to_one', True):
            final_result["global_is_one_to_one"] = False
            
            # 收集不一致的配对
            if chunk_result.get('sample_inconsistent_pairs'):
                for pair in chunk_result['sample_inconsistent_pairs']:
                    final_result["all_inconsistent_pairs"].add(pair)
            
            # 收集有多机场的城市
            if chunk_result.get('sample_non_unique_cities'):
                for city in chunk_result['sample_non_unique_cities'].keys():
                    final_result["all_non_unique_cities"].add(city)
        
        # 每处理5个块打印一次进度
        if (i + 1) % 5 == 0:
            print(f"Processed {i+1} batches ({(i+1)*batch_size} rows)...")
            print(f"Current status: {'Consistent' if final_result['global_is_one_to_one'] else 'Inconsistent'}")
    
    # 整理最终结果
    final_result.update({
        "total_inconsistent_pairs": len(final_result["all_inconsistent_pairs"]),
        "sample_inconsistent_pairs": list(final_result["all_inconsistent_pairs"])[:10],
        "total_non_unique_cities": len(final_result["all_non_unique_cities"]),
        "sample_non_unique_cities": list(final_result["all_non_unique_cities"])[:10]
    })
    
    # 删除临时数据
    del final_result["all_inconsistent_pairs"]
    del final_result["all_non_unique_cities"]
    
    print("\nFinal Validation Result:")
    for k, v in final_result.items():
        print(f"{k}: {v}")

ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


Processed 5 batches (500000 rows)...
Current status: Inconsistent
ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


Processed 10 batches (1000000 rows)...
Current status: Inconsistent
ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


Processed 15 batches (1500000 rows)...
Current status: Inconsistent
ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


Processed 20 batches (2000000 rows)...
Current status: Inconsistent
ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


Processed 25 batches (2500000 rows)...
Current status: Inconsistent
ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


Processed 30 batches (3000000 rows)...
Current status: Inconsistent
ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


Processed 35 batches (3500000 rows)...
Current status: Inconsistent
ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


Processed 40 batches (4000000 rows)...
Current status: Inconsistent
ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


Processed 45 batches (4500000 rows)...
Current status: Inconsistent
ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


Processed 50 batches (5000000 rows)...
Current status: Inconsistent
ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


Processed 55 batches (5500000 rows)...
Current status: Inconsistent
ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


Processed 60 batches (6000000 rows)...
Current status: Inconsistent
ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


Processed 65 batches (6500000 rows)...
Current status: Inconsistent
ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


Processed 70 batches (7000000 rows)...
Current status: Inconsistent
ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


Processed 75 batches (7500000 rows)...
Current status: Inconsistent
ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


Processed 80 batches (8000000 rows)...
Current status: Inconsistent
ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


Processed 85 batches (8500000 rows)...
Current status: Inconsistent
ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


Processed 90 batches (9000000 rows)...
Current status: Inconsistent
ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


Processed 95 batches (9500000 rows)...
Current status: Inconsistent
ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


Processed 100 batches (10000000 rows)...
Current status: Inconsistent
ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


Processed 105 batches (10500000 rows)...
Current status: Inconsistent
ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


Processed 110 batches (11000000 rows)...
Current status: Inconsistent
ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


Processed 115 batches (11500000 rows)...
Current status: Inconsistent
ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


Processed 120 batches (12000000 rows)...
Current status: Inconsistent
ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


Processed 125 batches (12500000 rows)...
Current status: Inconsistent
ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


Processed 130 batches (13000000 rows)...
Current status: Inconsistent
ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


Processed 135 batches (13500000 rows)...
Current status: Inconsistent
ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


Processed 140 batches (14000000 rows)...
Current status: Inconsistent
ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


Processed 145 batches (14500000 rows)...
Current status: Inconsistent
ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


Processed 150 batches (15000000 rows)...
Current status: Inconsistent
ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


Processed 155 batches (15500000 rows)...
Current status: Inconsistent
ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


Processed 160 batches (16000000 rows)...
Current status: Inconsistent
ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


Processed 165 batches (16500000 rows)...
Current status: Inconsistent
ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


Processed 170 batches (17000000 rows)...
Current status: Inconsistent
ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


Processed 175 batches (17500000 rows)...
Current status: Inconsistent
ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


Processed 180 batches (18000000 rows)...
Current status: Inconsistent
ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py


  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


ERROR: Could not find file C:\Users\25495\AppData\Local\Temp\ipykernel_24792\1553716684.py

Final Validation Result:
total_batches_processed: 182
global_is_one_to_one: False
total_inconsistent_pairs: 220
sample_inconsistent_pairs: [(3380660, '9fdc72a9627748e0af81138d4e4d1cb2', '110eb937ac9949de8dcf03678a6b9488'), (3506753, '746abfa2f0f847fd92f41be377674b07', '89e02832a70649ad92813f5c208b7fff'), (3506156, '343c4fb82a7d42e5b5da31303b20daf4', 'f8959e193c85492babc6be1a88b673ab'), (3305680, '28ff248bb824478aa9847f111e1e6351', 'f0dc6fafd9254668826a8073c710f6d5'), (3226883, '8d7056cbecd54f10937508ac2b5badad', '175cd97c7d4b42f2a1d576e98fbdfca3'), (3410788, '17925e1011d34f41bf7148e9b1bba446', '11c0573f1c3d433ea1b69ea7eab9861d'), (1119689, 'af901acb1d444252a99534b98667de4a', '50645e8003cd431e9181084348a36e2b'), (3335196, '1292e647c5a943819a557ed204a6749f', 'e38f86ec54844cd788a378b70170dad4'), (564562, '041ced7f3f5247e8990088541fe9201a', 'b6f363b7434940669b34d61ff90aa8a8'), (2500868, 'f57fcfdad08

  city_to_airport_counts = df_sample.groupby(cols[0])[cols[1]].nunique()


 'legs0_segments0_aircraft_code',
 'legs0_segments0_arrivalTo_airport_city_iata',
 'legs0_segments0_arrivalTo_airport_iata',
 'legs0_segments0_departureFrom_airport_iata'，
 'legs0_segments0_flightNumber',449
 'legs0_segments0_operatingCarrier_code',126
  'legs0_segments1_aircraft_code',378
 'legs0_segments1_arrivalTo_airport_city_iata',355
 'legs0_segments1_arrivalTo_airport_iata',376
  'legs1_segments0_aircraft_code'，308
 'legs1_segments0_arrivalTo_airport_city_iata'，285
 'legs1_segments0_arrivalTo_airport_iata'，274
 'legs1_segments0_departureFrom_airport_iata'， 285


CONSISTENT={
            'legs0_segments0_aircraft_code'
         
}
#建议是只保留legs0_segments0_arrivalTo_airport_iata这一项
'legs0_segments0_arrivalTo_airport_iata'

In [6]:
import pandas as pd
import pyarrow.parquet as pq
from sklearn.preprocessing import LabelEncoder
import gc

# 1. 文件路径
file_path = "E:/GIT PROJECT/FR/kaggle/input/data/aeroclub-recsys-2025/train.parquet"
cols_to_load = ['legs0_segments0_arrivalTo_airport_iata', 'selected']  # 只加载需要的列

# 2. 使用pyarrow打开Parquet文件
parquet_file = pq.ParquetFile(file_path)

# 3. 初始化编码器
le = LabelEncoder()

# 4. 第一次遍历：收集所有唯一类别
print("Collecting unique airport codes for LabelEncoder fitting...")
unique_airports = set()
for batch in parquet_file.iter_batches(batch_size=500000, columns=cols_to_load):
    df_batch = batch.to_pandas()
    unique_airports.update(df_batch['legs0_segments0_arrivalTo_airport_iata'].astype(str).unique())
    del df_batch
    gc.collect()

# 5. 拟合LabelEncoder
le.fit(list(unique_airports))

# 5. 第二次遍历：应用编码并保存结果
print("Transforming data...")
output_chunks = []
row_group_size = 500000  # 每个行组的大小

for i in range(parquet_file.num_row_groups):
    # 每次读取一个行组
    df_chunk = parquet_file.read_row_group(i, columns=cols_to_load).to_pandas()
    
    # 应用编码
    df_chunk['airport_encoded'] = le.transform(df_chunk['legs0_segments0_arrivalTo_airport_iata'].astype(str))
    df_chunk['airport_encoded'] = df_chunk['airport_encoded'].astype('uint16')  # 优化内存
    
    # 删除原始列
    df_chunk.drop(columns=['legs0_segments0_arrivalTo_airport_iata'], inplace=True)
    
    output_chunks.append(df_chunk)
    del df_chunk
    gc.collect()

# 6. 合并所有块
df_encoded = pd.concat(output_chunks, ignore_index=True)

# 7. 验证结果
print("\nEncoded Data Sample:")
print(df_encoded.head())
print(f"\nUnique encoded values: {df_encoded['airport_encoded'].nunique()}")

# 8. 保存结果
output_path = "E:/GIT PROJECT/FR/kaggle/input/data/aeroclub-recsys-2025/train_encoded.parquet"
df_encoded.to_parquet(output_path, index=False)
print(f"\nEncoded data saved to {output_path}")

Collecting unique airport codes for LabelEncoder fitting...
Transforming data...

Encoded Data Sample:
   selected  airport_encoded
0         1              239
1         0              362
2         0              362
3         0              362
4         0              362

Unique encoded values: 534

Encoded data saved to E:/GIT PROJECT/FR/kaggle/input/data/aeroclub-recsys-2025/train_encoded.parquet


In [2]:
import pandas as pd

df = pd.read_parquet('../data/aeroclub-recsys-2025/encode/train/train_segment_2_encoded.parquet')

# 创建一个新的DataFrame来存储结果
result_df = pd.DataFrame(columns=df.columns)

# 为每列获取前5个非空唯一值，并格式化为"[type] value"
for column in df.columns:
    non_na_values = df[column].dropna()
    unique_values = non_na_values.unique()[:5]
    dtype = str(df[column].dtype)
    
    # 格式化为"[type] value"并填充到结果DataFrame中
    formatted_values = [f"[{dtype}] {value}" for value in unique_values]
    result_df[column] = pd.Series(formatted_values)

# 保存为CSV文件
result_df.to_csv('output.csv', index=False)

print("处理完成，结果已保存到output.csv")

处理完成，结果已保存到output.csv
