In [37]:
import gdown
import pandas as pd

# CSV export endpoint of the Google Sheets document (gid=0 selects the sheet tab)
url = "https://docs.google.com/spreadsheets/d/1ymGFVOlGGp2SKvMfTHN7tyF4SAiIX7PTkY8cbRPr7AM/export?format=csv&gid=0"

# Fetch the export into a local file in the current working directory;
# quiet=False keeps gdown's progress output visible
output = 'test1.csv'
gdown.download(url, output, quiet=False)

# Load the downloaded file and report its dimensions as a quick sanity check
df = pd.read_csv(output)
n_rows, n_cols = df.shape
print(f"Number of rows: {n_rows}")
print(f"Number of columns: {n_cols}")

Downloading...
From: https://docs.google.com/spreadsheets/d/1ymGFVOlGGp2SKvMfTHN7tyF4SAiIX7PTkY8cbRPr7AM/export?format=csv&gid=0
To: /Users/gspinaci/projects/mb_diaries/MB_annotations_extractor/evaluation/1/test1.csv
249kB [00:00, 13.8MB/s]

Number of rows: 3543
Number of columns: 9





In [38]:
import pandas as pd

# Load the local CSV export from disk
input_file = 'test1.csv'
df = pd.read_csv(input_file)

# Count how many rows belong to each distinct value of the "Diary" column
diary_counts = (
    df.groupby('Diary')
    .size()
    .reset_index(name='counts')
)

# Show the per-diary totals
print(diary_counts)

# Keep only the rows whose "Diary" value is exactly '1894-1895'.
# NOTE(review): the printed counts include a single '1894-1805' row, which
# looks like a typo for '1894-1895'; this exact-match filter excludes it —
# confirm against the source sheet.
is_1894_1895 = df['Diary'] == '1894-1895'
diary_1894_1895 = df[is_1894_1895]

        Diary  counts
0   1894-1805       1
1   1894-1895     125
2   1895-1896     100
3   1896-1898       8
4   1897-1898      80
5   1898-1899      55
6   1899-1902     124
7   1902-1903      99
8   1903-1904     182
9        1905     256
10       1906     334
11       1907     220
12       1908     272
13       1909      12
14       1915     107
15       1916      95
16       1922     186
17       1923     180
18       1926       1


In [39]:
import os

# Build the ground-truth table for the 1894-1895 diary: the page, place name
# and GeoNames link of every row that has a link, under the column names
# page / value / geonames_url.
#
# The frame is derived from `diary_1894_1895` in a single chain, so no column
# is ever assigned on a filtered slice (which raises SettingWithCopyWarning)
# and re-running the cell always gives the same result.
diary_1894_1895_filtered = (
    diary_1894_1895[['Page (sequence)', 'Place', 'Geonames URL']]
    .rename(columns={
        'Page (sequence)': 'page',
        'Place': 'value',
        'Geonames URL': 'geonames_url',
    })
    # Rows without a GeoNames link cannot be evaluated
    .dropna(subset=['geonames_url'])
    .assign(
        # Strip the trailing slug (".../3170621/pistoia.html" -> ".../3170621")
        # or a trailing slash. The negative lookahead leaves the URL alone when
        # its last segment is purely numeric: stripping unconditionally would
        # turn an already-bare ".../3170621" into "https://www.geonames.org"
        # and lose the id.
        geonames_url=lambda frame: frame['geonames_url'].str.replace(
            r'/(?!\d+$)[^/]*$', '', regex=True
        ),
        # Page numbers are written as integers; astype(int) raises if a
        # remaining row has a missing page, which surfaces bad data early.
        page=lambda frame: frame['page'].astype(int),
    )
)

# Write the table to disk, creating the output directory on first run
gt_path = '1894-95/ground_truth.csv'
os.makedirs(os.path.dirname(gt_path), exist_ok=True)
diary_1894_1895_filtered.to_csv(gt_path, index=False)

In [40]:
import os
import pandas as pd


# Read the predicted places (from the sibling "preprocess" directory) and the
# ground truth stored at `gt_path`
predictions_path = os.path.join(os.pardir, 'preprocess', 'predicted_places.csv')
predictions = pd.read_csv(predictions_path)
ground_truth = pd.read_csv(gt_path)

# Both sides are reduced to the join keys plus their URL column, and each URL
# column is renamed so the two remain distinguishable after the merge
key_columns = ['page', 'value']
gt_side = ground_truth[key_columns + ['geonames_url']].rename(
  columns={'geonames_url': 'ground_truth_geonames_url'}
)
pred_side = predictions[key_columns + ['geonames_url']].rename(
  columns={'geonames_url': 'predicted_geonames_url'}
)

# Pair up rows that share the same page and place name, then drop rows that
# are exact duplicates.
# NOTE(review): pd.merge defaults to an inner join, so ground-truth places with
# no matching prediction (and predictions with no ground-truth row) never reach
# test.csv — confirm this is intended for metrics computed from that file.
test_df = pd.merge(gt_side, pred_side, on=key_columns).drop_duplicates()

# Show the paired rows
print(test_df)

# Save the paired rows next to the ground-truth file
test_df.to_csv(os.path.join("1894-95", 'test.csv'), index=False)

    page                       value         ground_truth_geonames_url  \
0      5                     Pistoia  https://www.geonames.org/3170621   
1     14                   Viareggio  https://www.geonames.org/3164433   
13    50                      Urbino  https://www.geonames.org/3165035   
14    51                      Urbino  https://www.geonames.org/3165035   
15    52                       Cagli  https://www.geonames.org/3181277   
16    53                      Gubbio  https://www.geonames.org/3175687   
17    57                       Milan  https://www.geonames.org/3173435   
19    58                       Zogno  https://www.geonames.org/3163854   
20    60                     Bergamo  https://www.geonames.org/3182164   
21    60                  Strassburg  https://www.geonames.org/2973783   
23    61                  Strassburg  https://www.geonames.org/2973783   
27    64                      London  https://www.geonames.org/2643743   
28    67                       Paris  

In [41]:
# Load the paired ground-truth / predicted links
test_df = pd.read_csv('1894-95/test.csv')

gt_url = test_df['ground_truth_geonames_url']
pred_url = test_df['predicted_geonames_url']

# TP: the predicted link equals a present ground-truth link
# FP: a link was predicted but differs from the ground truth
# FN: a ground-truth link exists but no link was predicted
TP = (gt_url.eq(pred_url) & gt_url.notna()).sum()
FP = (gt_url.ne(pred_url) & pred_url.notna()).sum()
FN = (gt_url.notna() & pred_url.isna()).sum()

# Precision, recall and F1; each falls back to 0 when its denominator is 0
predicted_total = TP + FP
actual_total = TP + FN
precision = TP / predicted_total if predicted_total > 0 else 0
recall = TP / actual_total if actual_total > 0 else 0
precision_plus_recall = precision + recall
f1_score = 2 * (precision * recall) / precision_plus_recall if precision_plus_recall > 0 else 0

# Report the raw counts followed by the scores as percentages
print(f"True Positives (TP): {TP}")
print(f"False Positives (FP): {FP}")
print(f"False Negatives (FN): {FN}")
print(f"Precision: {precision * 100:.2f}%")
print(f"Recall: {recall * 100:.2f}%")
print(f"F1-Score: {f1_score * 100:.2f}%")

True Positives (TP): 40
False Positives (FP): 3
False Negatives (FN): 0
Precision: 93.02%
Recall: 100.00%
F1-Score: 96.39%
