# Data overview



In [1]:
# Libraries
import pandas as pd
import numpy as np
from pathlib import Path
# Widen pandas' display limits so wide tables are easier to read
_DISPLAY_OPTIONS = {
    'display.max_columns': 200,
    'display.width': 200,
}
for _option, _value in _DISPLAY_OPTIONS.items():
    pd.set_option(_option, _value)

In [2]:
# Locate the raw CSV inputs (path is relative to the notebook's working directory)
data_dir = Path('..', 'data', 'raw')
csv_files = sorted(data_dir.glob('*.csv'))  # sorted() already returns a list
print(f'Found {len(csv_files)} CSV file(s) in {data_dir.resolve()}')
for csv_path in csv_files:
    print('-', csv_path.name)

Found 2 CSV file(s) in C:\Users\DELL\Desktop\SriLanka-Agricultural-Insights\data\raw
- production_of_major_agricultural_crops.csv
- volume_and_value_of_export_agriculture_crops.csv


In [3]:
# quick summary function
def analyze_df(df: pd.DataFrame, name: str):
    """Print a quick structural overview of ``df`` under the label ``name``.

    The report covers, in order: shape, dtypes, missing values (count and
    percent of rows), number of duplicated rows, ``describe()`` tables for
    numeric and for object/category columns, and columns that look like they
    hold years.

    Parameters
    ----------
    df : pd.DataFrame
        Frame to summarise. It is only read, never modified.
    name : str
        Label printed in the report header.

    Returns
    -------
    None
        All results are emitted through ``print`` and ``display``.

    Notes
    -----
    ``display`` is the IPython built-in available in notebook kernels.
    """
    banner = '=' * 80
    print('\n' + banner)
    print(f'Dataset: {name}')
    print('Shape:', df.shape)
    print('\nColumns and dtypes:')
    print(df.dtypes)

    # missing values: keep only the columns that actually have gaps
    missing = df.isnull().sum()
    missing = missing[missing > 0].sort_values(ascending=False)
    if len(missing):
        print('\nColumns with missing values (count):')
        print(missing)
        print('\nMissing percent (relative to rows):')
        print((missing / len(df) * 100).round(2))
    else:
        print('\nNo missing values detected')

    # duplicates
    dup_count = df.duplicated().sum()
    print(f'\nDuplicate rows: {dup_count}')

    # quick stats -- describe() raises ValueError on a frame with no columns,
    # so each summary is skipped when its dtype selection is empty
    numeric_df = df.select_dtypes(include=[np.number])
    print('\nNumeric summary:')
    if numeric_df.shape[1]:
        display(numeric_df.describe().T)
    else:
        print('(no numeric columns)')

    categorical_df = df.select_dtypes(include=['object', 'category'])
    print('\nCategorical summary (top values):')
    if categorical_df.shape[1]:
        display(categorical_df.describe().T)
    else:
        print('(no categorical columns)')

    # detect years by column name; str()/strip() tolerate non-string labels
    # and headers padded with whitespace
    year_names = ('year', 'yr', 'period', 'date')
    year_cols = [c for c in df.columns if str(c).strip().lower() in year_names]
    if year_cols:
        for yc in year_cols:
            try:
                vals = pd.to_numeric(df[yc], errors='coerce').dropna()
                if len(vals):
                    print(f'\nDetected year-like column: {yc} — range: {int(vals.min())} to {int(vals.max())}')
            except (TypeError, ValueError, OverflowError) as err:
                # best effort: report the problem instead of hiding it
                print(f'\nCould not evaluate year-like column {yc}: {err}')
    else:
        # otherwise fall back to numeric columns whose values all lie in 1800..2100
        for c in numeric_df.columns:
            vals = df[c].dropna()
            if vals.dtype.kind in 'ifu':  # skip complex dtypes, which cannot be range-compared
                vmin, vmax = vals.min(), vals.max()
                if 1800 <= vmin <= 2100 and 1800 <= vmax <= 2100:
                    print(f'\nDetected possible year column: {c} — range: {int(vmin)} to {int(vmax)}')
    print('\n' + banner + '\n')

In [4]:
# Read every discovered CSV into a dict keyed by file name
dfs = {}
for csv_path in csv_files:
    try:
        dfs[csv_path.name] = pd.read_csv(csv_path, low_memory=False)
    except Exception as err:
        # keep going so one unreadable file does not block the overview
        print(f'Failed to read {csv_path.name}:', err)

# Summarise each frame that loaded successfully
for name, df in dfs.items():
    analyze_df(df, name)


Dataset: production_of_major_agricultural_crops.csv
Shape: (13, 8)

Columns and dtypes:
Crop     object
2006    float64
2007    float64
2008    float64
2009    float64
2010    float64
2011    float64
2012    float64
dtype: object

No missing values detected

Duplicate rows: 0

Numeric summary:


Unnamed: 0,count,mean,std,min,25%,50%,75%,max
2006,13.0,3752.076923,5248.200296,56.0,310.8,1944.0,3500.0,15900.0
2007,13.0,4145.369231,5750.876129,29.0,305.2,2869.0,3131.0,16597.0
2008,13.0,3945.453846,4957.576127,39.0,318.7,2174.0,3875.0,14899.0
2009,13.0,3786.992308,5475.207211,32.0,291.0,2400.0,3125.0,15767.0
2010,13.0,4463.584615,6092.353822,31.0,331.4,2376.0,4301.0,17332.0
2011,13.0,3751.361538,5291.368045,35.0,327.5,2206.0,3894.0,18250.0
2012,13.0,4205.8,6243.059214,36.0,328.4,2002.0,3846.0,18604.0



Categorical summary (top values):


Unnamed: 0,count,unique,top,freq
Crop,13,13,Tea (mn kg),1





Dataset: volume_and_value_of_export_agriculture_crops.csv
Shape: (6, 15)

Columns and dtypes:
Crop                       object
   Volume 2006            float64
   Value 2006(US $ mn)    float64
   Volume 2007            float64
   Value 2007(US $ mn)    float64
   Volume 2008            float64
   Value 2008(US $ mn)    float64
   Volume 2009            float64
   Value2009 (US $ mn)    float64
   Volume 2010            float64
   Value2010 (US $ mn)    float64
   Volume 2011            float64
   Value 2011(US $ mn)    float64
   Volume 2012            float64
   Value 2012(US $ mn)    float64
dtype: object

Columns with missing values (count):
Crop                      3
   Volume 2006            3
   Value 2006(US $ mn)    3
   Volume 2007            3
   Value 2007(US $ mn)    3
   Volume 2008            3
   Value 2008(US $ mn)    3
   Volume 2009            3
   Value2009 (US $ mn)    3
   Volume 2010            3
   Value2010 (US $ mn)    3
   Volume 2011            3
   Va

Unnamed: 0,count,mean,std,min,25%,50%,75%,max
Volume 2006,3.0,276.666667,209.362111,46.6,187.0,327.4,391.7,456.0
Value 2006(US $ mn),3.0,366.066667,446.384278,93.1,108.5,123.9,502.55,881.2
Volume 2007,3.0,273.366667,205.499205,51.4,181.55,311.7,384.35,457.0
Value 2007(US $ mn),3.0,420.966667,523.195378,109.4,118.95,128.5,576.75,1025.0
Volume 2008,3.0,249.531721,176.629539,48.617861,184.111071,319.604281,349.98865,380.37302
Value 2008(US $ mn),3.0,514.022253,656.505681,124.9334,135.033379,145.133358,708.566679,1272.0
Volume 2009,3.0,237.444586,161.782214,55.990664,172.850009,289.709354,328.171548,366.633741
Value2009 (US $ mn),3.0,482.026407,609.890266,98.541032,130.386436,162.23184,673.769094,1185.306348
Volume 2010,3.0,210.347104,142.533199,51.863961,151.503723,251.143484,289.588676,328.033867
Value2010 (US $ mn),3.0,593.210309,733.887693,165.831169,169.505362,173.179555,806.899879,1440.620203



Categorical summary (top values):


Unnamed: 0,count,unique,top,freq
Crop,3,3,Tea (mn kg),1






In [5]:
# show samples and detect years in headers
import re
def extract_years_from_cols(df):
    """Return the sorted, unique years (1900-2099) found in ``df``'s column labels.

    A year is any run of exactly four digits starting with 19 or 20 that is
    not part of a longer digit run. Digit lookarounds are used instead of
    ``\\b`` so that years attached to letters or underscores are still found
    (e.g. ``'Value2009 (US $ mn)'`` or ``'yield_2011'``).

    Parameters
    ----------
    df : object with a ``columns`` attribute
        Typically a DataFrame; each label is converted with ``str()``.

    Returns
    -------
    list of int
        Ascending years; empty when no label contains one.
    """
    year_pattern = re.compile(r'(?<!\d)(19\d{2}|20\d{2})(?!\d)')
    years = {int(y) for c in df.columns for y in year_pattern.findall(str(c))}
    return sorted(years)

# Fixed seed so the random sample below is identical on every Restart & Run All
SAMPLE_SEED = 42

for name, df in dfs.items():
    print('\n' + name)
    print('Shape:', df.shape)
    display(df.head())
    # n never exceeds len(df), so sample() cannot fail on size and needs no
    # exception guard; any other error should surface rather than be hidden
    display(df.sample(min(5, len(df)), random_state=SAMPLE_SEED))
    yrs = extract_years_from_cols(df)
    if yrs:
        print('Detected year columns in headers: ', yrs[0], 'to', yrs[-1])
    else:
        print('No year-like column names detected in headers')
    print('---')


production_of_major_agricultural_crops.csv
Shape: (13, 8)


Unnamed: 0,Crop,2006,2007,2008,2009,2010,2011,2012
0,Tea (mn kg),310.8,305.2,318.7,291.0,331.4,327.5,328.4
1,Rubber (mn kg),109.2,117.6,129.2,136.9,153.2,158.2,152.0
2,Coconut (mn nuts),2785.0,2869.0,2909.0,2762.0,2584.0,2808.0,2940.0
3,Coffee (mt),3500.0,2979.0,3081.0,3125.0,3164.0,2974.0,3000.0
4,Cocoa (mt),810.0,393.0,1695.0,467.0,520.0,525.0,513.0


Unnamed: 0,Crop,2006,2007,2008,2009,2010,2011,2012
4,Cocoa (mt),810.0,393.0,1695.0,467.0,520.0,525.0,513.0
3,Coffee (mt),3500.0,2979.0,3081.0,3125.0,3164.0,2974.0,3000.0
12,Sugar Production (by Year) (mt '000),56.0,29.0,39.0,32.0,31.0,35.0,36.0
8,Cardamom (mt),80.0,90.0,71.0,61.0,48.0,57.0,80.0
5,Cinnamon (mt),15900.0,16505.0,14899.0,15765.0,16435.0,18250.0,17165.0


Detected year columns in headers:  2006 to 2012
---

volume_and_value_of_export_agriculture_crops.csv
Shape: (6, 15)


Unnamed: 0,Crop,Volume 2006,Value 2006(US $ mn),Volume 2007,Value 2007(US $ mn),Volume 2008,Value 2008(US $ mn),Volume 2009,Value2009 (US $ mn),Volume 2010,Value2010 (US $ mn),Volume 2011,Value 2011(US $ mn),Volume 2012,Value 2012(US $ mn)
0,Tea (mn kg),327.4,881.2,311.7,1025.0,319.604281,1272.0,289.709354,1185.306348,328.033867,1440.620203,323.01208,1490.898234,319.945774,1411.9187
1,Rubber (mn kg),46.6,93.1,51.4,109.4,48.617861,124.9334,55.990664,98.541032,51.863961,173.179555,42.60641,206.31504,37.377057,125.107677
2,Coconut (mn nuts) (numbers are given und...,456.0,123.9,457.0,128.5,380.37302,145.133358,366.633741,162.23184,251.143484,165.831169,385.727824,265.974372,350.694604,208.898246
3,,,,,,,,,,,,,,,
4,,,,,,,,,,,,,,,


Unnamed: 0,Crop,Volume 2006,Value 2006(US $ mn),Volume 2007,Value 2007(US $ mn),Volume 2008,Value 2008(US $ mn),Volume 2009,Value2009 (US $ mn),Volume 2010,Value2010 (US $ mn),Volume 2011,Value 2011(US $ mn),Volume 2012,Value 2012(US $ mn)
4,,,,,,,,,,,,,,,
2,Coconut (mn nuts) (numbers are given und...,456.0,123.9,457.0,128.5,380.37302,145.133358,366.633741,162.23184,251.143484,165.831169,385.727824,265.974372,350.694604,208.898246
5,,,,,,,,,,,,,,,
0,Tea (mn kg),327.4,881.2,311.7,1025.0,319.604281,1272.0,289.709354,1185.306348,328.033867,1440.620203,323.01208,1490.898234,319.945774,1411.9187
1,Rubber (mn kg),46.6,93.1,51.4,109.4,48.617861,124.9334,55.990664,98.541032,51.863961,173.179555,42.60641,206.31504,37.377057,125.107677


Detected year columns in headers:  2006 to 2012
---
