In [1]:
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
import numpy as np
import re

# Set plot style
sns.set(style="whitegrid")

In [2]:
number_of_bids_filter = 3

In [3]:
try:
    df = pd.read_csv("df.csv")
    print("Data loaded successfully.")
except Exception as e:
    print(f"Error loading data: {e}")

df['date'] = pd.to_datetime(df['date'], errors='coerce')
df.dropna(subset=['date'], inplace=True)
print(f"Min Date: {df['date'].min()}")
print(f"Max Date: {df['date'].max()}")

Data loaded successfully.
Min Date: 2025-09-01 00:00:00
Max Date: 2025-11-22 00:00:00


  df = pd.read_csv("df.csv")


In [4]:
print("Total number of rows:", len(df))
print("Number of rows with missing spec_id:", df['spec_id'].isna().sum())
df = df.dropna(subset=['spec_id'])
print("Number of rows after removing missing spec_id:", len(df))


Total number of rows: 295450
Number of rows with missing spec_id: 22176
Number of rows after removing missing spec_id: 273274


In [5]:
def clean_grade_column(val):
    s = str(val).lower().strip().replace("g", "").replace("_", ".")
    
    if s in ['nan', 'none', '', '0']:
        return np.nan

    if 'auth' in s:
        return np.nan

    if any(x in s for x in ['pristine', 'perfect', '10p', '10b']):
        return 10.0

    match = re.search(r"(\d+(\.\d+)?)", s)
    if match:
        try:
            return float(match.group(1))
        except ValueError:
            return np.nan

    return np.nan

In [6]:
print("Counts of each grade before cleaning:")
print(df['grade'].value_counts(dropna=False))

Counts of each grade before cleaning:
grade
g10                  134901
g9                    73478
g8                    24926
g7                     8655
g10pristine            8132
g6                     5478
g9_5                   3937
g5                     3097
g8_5                   3031
g4                     1709
g10p                   1424
g1                      865
g3                      824
g7_5                    734
g2                      467
g10b                    404
auth                    356
g6_5                    281
g5_5                    185
g4_5                     91
g10perfect               85
g3_5                     59
authentic                55
g1_5                     32
g2_5                     30
g0                       22
authentic_altered        11
NaN                       5
Name: count, dtype: int64


In [7]:
df['grade'] = df['grade'].apply(clean_grade_column)
df = df.dropna(subset=['grade'])
print("Counts of each grade after cleaning:")
print(df['grade'].value_counts().sort_index(ascending=False))
print("Number of rows after cleaning grade", len(df))

Counts of each grade after cleaning:
grade
10.0    144946
9.5       3937
9.0      73478
8.5       3031
8.0      24926
7.5        734
7.0       8655
6.5        281
6.0       5478
5.5        185
5.0       3097
4.5         91
4.0       1709
3.5         59
3.0        824
2.5         30
2.0        467
1.5         32
1.0        865
Name: count, dtype: int64
Number of rows after cleaning grade 272825


In [8]:
def group_currencies(val):
    s = str(val).strip()

    if s.startswith('$') or s[0].isdigit():
        return '$ (No Country Code)'

    return s

currency_groups = df['price'].str.split().str[0].apply(group_currencies)

print("Currnecy of sale:")
currency_groups.value_counts()

Currnecy of sale:


price
$ (No Country Code)    137042
US                     128879
EUR                      2596
C                        1969
GBP                      1413
AU                        926
Name: count, dtype: int64

In [9]:
df = df.loc[currency_groups.isin(['$ (No Country Code)', 'US'])]
df['price'] = df['price'].astype(str).str.replace(r'\D+', '', regex=True).astype(int)
print("Number of rows after cleaning sale price", len(df))

Number of rows after cleaning sale price 265921


In [10]:
print("Count of each grader:", df['grader'].value_counts())

Count of each grader: grader
PSA    228159
CGC     31209
BGS      6284
Name: count, dtype: int64


In [11]:
print("Number of bids raw:")
df["number_of_bids"].describe()

Number of bids raw:


count    265921.000000
mean         16.007923
std          12.786622
min           0.000000
25%           7.000000
50%          13.000000
75%          22.000000
max         629.000000
Name: number_of_bids, dtype: float64

In [12]:
print(f"Number of bids filter at >={number_of_bids_filter}")
df = df.loc[df["number_of_bids"] >= number_of_bids_filter]
df["number_of_bids"].describe()

Number of bids filter at >=3


count    237100.000000
mean         17.799895
std          12.398225
min           3.000000
25%           9.000000
50%          15.000000
75%          23.000000
max         629.000000
Name: number_of_bids, dtype: float64

In [13]:
weekly_grouper = pd.Grouper(key='date', freq='W')
print("Transactions per week:")
df.groupby(weekly_grouper).size()

Transactions per week:


date
2025-09-07    17663
2025-09-14    28123
2025-09-21    66820
2025-09-28    16623
2025-10-05    10457
2025-10-12    11127
2025-10-19    17862
2025-10-26    25738
2025-11-02    15637
2025-11-09    16697
2025-11-16     9080
2025-11-23     1273
Freq: W-SUN, dtype: int64

In [14]:
grouper = pd.Grouper(key='date', freq='ME')

grouped = df.groupby([grouper, "spec_id", "grade", "grader"])['price'].mean()

comparison_df = grouped.unstack(level='grader')

if 'PSA' in comparison_df.columns and 'BGS' in comparison_df.columns:
    comparison_df['PSA_vs_BGS_Ratio'] = comparison_df['PSA'] / comparison_df['BGS']

if 'PSA' in comparison_df.columns and 'CGC' in comparison_df.columns:
    comparison_df['PSA_vs_CGC_Ratio'] = comparison_df['PSA'] / comparison_df['CGC']

In [15]:
grade_analysis = comparison_df.groupby(level='grade')[['PSA_vs_BGS_Ratio', 'PSA_vs_CGC_Ratio']].agg(['mean', 'std', 'count'])

print(grade_analysis)

grader PSA_vs_BGS_Ratio                 PSA_vs_CGC_Ratio                
                   mean       std count             mean       std count
grade                                                                   
1.0                 NaN       NaN     0              NaN       NaN     0
1.5                 NaN       NaN     0              NaN       NaN     0
2.0            1.095806  0.375187     2         1.385542       NaN     1
2.5                 NaN       NaN     0              NaN       NaN     0
3.0            1.293103       NaN     1         1.620631  0.086829     2
3.5                 NaN       NaN     0              NaN       NaN     0
4.0            1.428890  0.422331     6         2.931586  5.210093    13
4.5                 NaN       NaN     0              NaN       NaN     0
5.0            1.350861  0.642564    11         1.414997  0.675925    28
5.5                 NaN       NaN     0              NaN       NaN     0
6.0            1.413499  0.436565    12         1.4

In [16]:
grouper = pd.Grouper(key='date', freq='W')
grouped = df.groupby([grouper, "spec_id", "grade", "grader"])['price'].mean()

price_df = grouped.unstack(level='grade')

ratio_data = []

for g in range(1, 10):
    whole = float(g)
    half = g + 0.5
    
    if whole in price_df.columns and half in price_df.columns:
        ratios = price_df[half] / price_df[whole]
        
        temp_df = pd.DataFrame({
            'grade_step': f"{whole} -> {half}",
            'ratio': ratios
        })
        ratio_data.append(temp_df)

if ratio_data:
    all_ratios = pd.concat(ratio_data)
    
    stats_by_grade = all_ratios.groupby('grade_step')['ratio'].agg(['median', 'std', "count"])

    # Calculate Total stats across all grades
    total_mean = all_ratios['ratio'].mean()
    total_std = all_ratios['ratio'].std()

    print("Statistics by Grade Step:")
    print(stats_by_grade)
    print(f"\nOverall Mean: {total_mean:.4f}")
    print(f"Overall Std Dev: {total_std:.4f}")

Statistics by Grade Step:
              median        std  count
grade_step                            
1.0 -> 1.5  1.318440   0.553333      4
2.0 -> 2.5  0.872340        NaN      1
3.0 -> 3.5  1.566879        NaN      1
4.0 -> 4.5  1.204545        NaN      1
5.0 -> 5.5  1.950000   0.470072      3
6.0 -> 6.5  0.983871   0.953004      9
7.0 -> 7.5  1.077456   0.548506     29
8.0 -> 8.5  1.134103   1.726320    256
9.0 -> 9.5  1.343858  16.551284    332

Overall Mean: 1.9972
Overall Std Dev: 12.0179
