In [None]:
# Import relevant packages
import random
import nltk
nltk.download('punkt')
nltk.download('averaged_perceptron_tagger')

!pip install jsonlines
import jsonlines

import pandas as pd
import numpy as np
import xml.etree.ElementTree as ET
import tensorflow_datasets as tfds
import matplotlib.pyplot as plt
import statsmodels.stats.multicomp as multi

# Build dataset

In [None]:
# Row indices (one list per CLAIRE split) of the sentences that are grammatical;
# the CLAIRE cell below keeps only these rows via `index.isin(...)`
grammatical_test = [6, 8, 9, 12, 14, 16, 17, 29, 30, 36, 44, 47, 49, 50, 53, 65, 68, 70, 74, 78, 80, 83, 86, 90, 95, 96, 109, 113, 117, 120, 123, 125, 127, 128, 132, 133, 135, 137, 140, 142, 144, 157, 158, 160, 161, 163, 169, 171, 174, 175, 177, 179, 181, 184, 185, 186, 189, 190, 192, 193, 194, 195, 199, 207, 212, 216, 218, 219, 220, 224, 226, 228, 230, 233, 234, 236, 239, 240, 243, 245, 246, 248, 250, 252, 254, 255, 256, 258, 260, 262, 263, 264, 268, 269, 271, 273, 275, 277, 278, 279, 281, 282, 283, 285, 286, 287, 288, 289, 290, 292, 294, 296, 298, 302, 303, 304, 305, 309, 311, 312, 313, 316, 318, 320, 321, 322, 324, 325, 327, 328, 329, 330, 331, 333, 334, 335, 336, 338, 340, 341, 346, 348, 349, 350, 351, 354, 355, 356, 357, 359, 360, 361, 362, 363, 365, 366, 367, 369, 370, 371, 374, 380, 388, 390, 394, 402, 420, 423, 436, 441, 454, 469, 483, 485]
grammatical_dev = [3, 12, 17, 19, 29, 41, 45, 46, 47, 48, 51, 52, 56, 60, 64, 75, 87, 93, 101, 116, 117, 119, 124, 128, 129, 132, 133, 134, 135, 138, 142, 144, 147, 151, 156, 157, 158, 159, 161, 163, 167, 170, 171, 177, 178, 179, 180, 184, 185, 187, 191, 193, 197, 199, 200, 202, 204, 205, 206, 208, 209, 210, 211, 212, 214, 215, 220, 222, 224, 225, 227, 231, 232, 237, 238, 242, 243, 250, 251, 253, 254, 258, 260, 261, 262, 263, 264, 267, 268, 270, 271, 273, 274, 276, 277, 278, 280, 281, 282, 283, 284, 285, 288, 290, 291, 293, 294, 295, 296, 298, 299, 300, 301, 302, 303, 304, 305, 306, 307, 308, 311, 312, 313, 314, 316, 317, 318, 319, 322, 323, 324, 325, 326, 327, 328, 330, 331, 335, 336, 338, 341, 342, 343, 344, 346, 347, 348, 349, 350, 352, 354, 356, 357, 358, 359, 360, 363, 364, 366, 368, 369, 370, 380, 408, 454]
grammatical_train = [6, 12, 13, 14, 21, 30, 39, 49, 51, 53, 58, 59, 60, 62, 63, 65, 66, 68, 74, 88, 97, 115, 116, 118, 138, 141, 144, 147, 162, 165, 175, 177, 181, 202, 203, 213, 219, 221, 238, 241, 245, 248, 255, 262, 265, 272, 279, 280, 293, 294, 298, 308, 319, 320, 329, 335, 350, 353, 355, 357, 364, 379, 388, 394, 402, 403, 417, 424, 431, 442, 449, 454, 458, 464, 471, 473, 479, 482, 483, 487, 495, 499, 500, 511, 514, 517, 530, 541, 547, 549, 553, 556, 559, 562, 563, 566, 597, 598, 599, 600, 604, 611, 613, 614, 616, 623, 628, 630, 631, 635, 638, 648, 649, 653, 654, 664, 672, 677, 678, 686, 690, 699, 701, 709, 717, 724, 727, 735, 738, 746, 755, 758, 767, 775, 777, 783, 784, 786, 799, 805, 825, 838, 843, 854, 860, 863, 866, 867, 873, 886, 889, 891, 899, 905, 912, 913, 921, 922, 937, 945, 950, 953, 960, 963, 971, 972, 975, 976, 977, 978, 980, 981, 984, 986, 987, 989, 990, 993, 997, 999, 1002, 1010, 1015, 1016, 1017, 1020, 1025, 1029, 1032, 1036, 1037, 1039, 1040, 1041, 1042, 1045, 1046, 1050, 1053, 1058, 1059, 1060, 1061, 1074, 1075, 1076, 1079, 1081, 1087, 1088, 1089, 1091, 1093, 1095, 1098, 1100, 1101, 1104, 1105, 1107, 1112, 1113, 1114, 1116, 1119, 1124, 1126, 1127, 1129, 1133, 1134, 1135, 1137, 1138, 1139, 1144, 1147, 1148, 1152, 1153, 1158, 1159, 1163, 1165, 1166, 1169, 1172, 1173, 1180, 1181, 1182, 1184, 1185, 1186, 1187, 1188, 1190, 1199, 1201, 1202, 1206, 1209, 1211, 1213, 1216, 1217, 1225, 1226, 1227, 1229, 1233, 1234, 1238, 1242, 1244, 1248, 1249, 1251, 1252, 1255, 1256, 1258, 1263, 1264, 1266, 1267, 1269, 1272, 1273, 1274, 1275, 1279, 1281, 1283, 1286, 1287, 1290, 1291, 1292, 1293, 1294, 1300, 1301, 1303, 1304, 1306, 1313, 1315, 1319, 1321, 1322, 1323, 1325, 1332, 1344, 1347, 1351, 1352, 1353, 1354, 1355, 1357, 1358, 1367, 1368, 1370, 1371, 1373, 1374, 1375, 1377, 1379, 1382, 1386, 1387, 1389, 1392, 1393, 1394, 1396, 1397, 1399, 1401, 1403, 1404, 1406, 1408, 1411, 1413, 1416, 1419, 1422, 1424, 1425, 1427, 1428, 1429, 1430, 1431, 1434, 1436, 1438, 1442, 
1443, 1444, 1445, 1446, 1447, 1448, 1450, 1451, 1453, 1455, 1457, 1458, 1461, 1462, 1463, 1465, 1466, 1469, 1473, 1475, 1477, 1479, 1482, 1484, 1485, 1486, 1488, 1490, 1491, 1493, 1495, 1496, 1497, 1500, 1506, 1508, 1509, 1510, 1513, 1516, 1518, 1520, 1522, 1528, 1532, 1536, 1539, 1543, 1544, 1545, 1547, 1549, 1550, 1551, 1554, 1555, 1556, 1557, 1559, 1562, 1566, 1567, 1573, 1574, 1575, 1577, 1578, 1581, 1585, 1593, 1594, 1595, 1596, 1599, 1601, 1602, 1603, 1605, 1606, 1607, 1609, 1610, 1612, 1613, 1614, 1615, 1617, 1618, 1619, 1621, 1623, 1624, 1626, 1628, 1629, 1633, 1634, 1637, 1640, 1643, 1647, 1648, 1652, 1656, 1658, 1659, 1660, 1664, 1666, 1668, 1669, 1674, 1677, 1682, 1686, 1689, 1690, 1692, 1698, 1699, 1701, 1705, 1706, 1708, 1713, 1714, 1720, 1723, 1724, 1728, 1729, 1730, 1733, 1737, 1738, 1739, 1741, 1742, 1743, 1744, 1752, 1753, 1754, 1758, 1759, 1762, 1763, 1765, 1768, 1774, 1775, 1776, 1778, 1782, 1786, 1787, 1791, 1792, 1794, 1798, 1799, 1800, 1807, 1808, 1809, 1811, 1813, 1814, 1815, 1821, 1824, 1825, 1827, 1828, 1829, 1831, 1836, 1837, 1842, 1844, 1845, 1846, 1849, 1854, 1861, 1863, 1865, 1868, 1869, 1873, 1875, 1877, 1878, 1879, 1881, 1885, 1886, 1889, 1890, 1892, 1893, 1896, 1899, 1900, 1902, 1904, 1908, 1911, 1913, 1916, 1917, 1923, 1924, 1927, 1928, 1929, 1932, 1934, 1937, 1938, 1944, 1945, 1949, 1951, 1952, 1953, 1954, 1957, 1958, 1959, 1961, 1962, 1963, 1964, 1968, 1969, 1970, 1971, 1972, 1973, 1974, 1975, 1976, 1977, 1979, 1981, 1982, 1983, 1985, 1986, 1987, 1990, 1991, 1992, 1994, 1995, 1996, 1997, 1998, 1999, 2002, 2003, 2004, 2005, 2007, 2008, 2009, 2010, 2011, 2012, 2013, 2015, 2017, 2020, 2023, 2024, 2025, 2026, 2028, 2029, 2030, 2031, 2033, 2039, 2040, 2041, 2043, 2044, 2045, 2046, 2047, 2048, 2050, 2051, 2052, 2055, 2056, 2057, 2058, 2060, 2062, 2063, 2065, 2066, 2067, 2069, 2071, 2073, 2075, 2077, 2078, 2079, 2081, 2082, 2090, 2091, 2092, 2094, 2097, 2098, 2099, 2101, 2102, 2104, 2105, 2106, 2108, 2109, 2111, 2112, 2114, 2115, 2117, 
2120, 2122, 2125, 2126, 2127, 2131, 2132, 2134, 2135, 2136, 2137, 2138, 2139, 2141, 2142, 2144, 2145, 2146, 2150, 2151, 2152, 2154, 2155, 2158, 2159, 2162, 2163, 2164, 2165, 2167, 2170, 2172, 2173, 2176, 2179, 2180, 2181, 2182, 2183, 2184, 2186, 2187, 2189, 2190, 2191, 2192, 2193, 2195, 2196, 2200, 2202, 2209, 2210, 2212, 2214, 2217, 2218, 2220, 2221, 2222, 2223, 2224, 2226, 2229, 2230, 2231, 2233, 2234, 2235, 2236, 2239, 2240, 2241, 2243, 2244, 2246, 2250, 2253, 2254, 2255, 2256, 2257, 2259, 2260, 2262, 2264, 2265, 2266, 2267, 2269, 2270, 2271, 2275, 2276, 2277, 2278, 2280, 2286, 2291, 2292, 2293, 2294, 2296, 2298, 2299, 2300, 2303, 2304, 2305, 2306, 2307, 2309, 2310, 2311, 2314, 2315, 2316, 2318, 2320, 2322, 2323, 2325, 2326, 2327, 2330, 2331, 2334, 2336, 2337, 2339, 2340, 2343, 2344, 2345, 2346, 2347, 2349, 2352, 2353, 2355, 2357, 2358, 2359, 2360, 2361, 2362, 2363, 2364, 2366, 2368, 2369, 2371, 2373, 2374, 2375, 2376, 2377, 2380, 2381, 2384, 2389, 2390, 2391, 2392, 2394, 2395, 2396, 2399, 2401, 2404, 2407, 2410, 2412, 2413, 2414, 2418, 2419, 2421, 2422, 2423, 2425, 2426, 2427, 2428, 2429, 2430, 2432, 2433, 2434, 2435, 2436, 2438, 2439, 2442, 2446, 2449, 2451, 2452, 2454, 2457, 2459, 2460, 2461, 2462, 2463, 2467, 2468, 2469, 2471, 2472, 2473, 2476, 2478, 2479, 2480, 2481, 2483, 2484, 2485, 2486, 2487, 2490, 2492, 2493, 2495, 2499, 2501, 2502, 2503, 2504, 2506, 2507, 2508, 2509, 2512, 2514, 2515, 2517, 2518, 2519, 2520, 2523, 2524, 2528, 2530, 2531, 2534, 2536, 2538, 2539, 2540, 2543, 2545, 2548, 2549, 2550, 2551, 2552, 2556, 2557, 2558, 2564, 2566, 2567, 2568, 2569, 2571, 2572, 2574, 2575, 2576, 2580, 2583, 2584, 2586, 2588, 2589, 2590, 2591, 2594, 2595, 2596, 2597, 2599, 2600, 2601, 2602, 2603, 2605, 2606, 2607, 2608, 2609, 2610, 2613, 2614, 2615, 2617, 2618, 2619, 2622, 2623, 2625, 2626, 2628, 2629, 2630, 2632, 2634, 2636, 2637, 2638, 2639, 2640, 2642, 2644, 2647, 2649, 2650, 2652, 2653, 2655, 2657, 2658, 2659, 2660, 2663, 2665, 2666, 2673, 2676, 2679, 2681, 
2682, 2683, 2685, 2686, 2687, 2688, 2689, 2691, 2692, 2693, 2694, 2695, 2696, 2700, 2701, 2702, 2703, 2704, 2706, 2708, 2709, 2710, 2711, 2712, 2716, 2717, 2718, 2719, 2721, 2723, 2725, 2726, 2728, 2729, 2734, 2735, 2736, 2738, 2741, 2743, 2746, 2747, 2748, 2749, 2750, 2752, 2753, 2754, 2757, 2758, 2762, 2764, 2765, 2766, 2767, 2770, 2774, 2776, 2777, 2778, 2779, 2780, 2781, 2782, 2783, 2784, 2785, 2786, 2787, 2788, 2789, 2790, 2791, 2792, 2793, 2794, 2795, 2796, 2801, 2802, 2804, 2805, 2806, 2807, 2808, 2809, 2812, 2813, 2814, 2815, 2816, 2818, 2819, 2820, 2821, 2822, 2823, 2826, 2827, 2828, 2829, 2830, 2832, 2835, 2836, 2837, 2838, 2840, 2841, 2843, 2848, 2849, 2850, 2853, 2859, 2860, 2861, 2863, 2865, 2867, 2871, 2872, 2873, 2874, 2876, 2877, 2878, 2879, 2880, 2882, 2883, 2885, 2886, 2887, 2889, 2890, 2895, 2896, 2897, 2903, 2904, 2905, 2906, 2907, 2908, 2910, 2911, 2913, 2914, 2915, 2916, 2917, 2918, 2919, 2920, 2921, 2922, 2924, 2926, 2930, 2933, 2936, 2937, 2938, 2941, 2943, 2946, 2948, 2949, 2950, 2951, 2959, 2970, 2972, 2987, 3012, 3024, 3051, 3057, 3109, 3119, 3120, 3145, 3151, 3161, 3163, 3179, 3184, 3189, 3217, 3242, 3244, 3252, 3264, 3281, 3293, 3323, 3346, 3350, 3363, 3378, 3385, 3388, 3392, 3412, 3424, 3425, 3485, 3487, 3503, 3508, 3523, 3535, 3590, 3600, 3602, 3622, 3644, 3673, 3690, 3694, 3700, 3717, 3721, 3736, 3745, 3750, 3755, 3795, 3800, 3821, 3825, 3830, 3854, 3855, 3857, 3873, 3881, 3893, 3894, 3897, 3915, 3922, 3926, 3934]

In [None]:
# The main dataframe and the control-data dataframe start out empty and share
# the same five columns.
dataset_columns = ['class', 'subclass', 'dataset', 'text', 'minimalpair']
data, control_data = (pd.DataFrame(columns=dataset_columns) for _ in range(2))

In [None]:
# Set counter for minimal pairs: every dataset cell below numbers its new rows
# starting at `i` and afterwards resets `i` to len(control_data).
i = 0

Winograd Schema 273 sentences

In [None]:
# Add WSC data to main dataset
ds = tfds.load("wsc273", split="test", shuffle_files=True)
# Sorting makes the row order independent of the shuffled file order.
textstoadd = sorted(bytes.decode(example['text'].numpy()) for example in ds)
datatoadd = {'class' : [3] * len(textstoadd), 'subclass' : ['Referential ambiguity'] * len(textstoadd),
             'dataset': ['WSC273'] * len(textstoadd), 'text' : textstoadd, 'minimalpair': list(range(i, i+len(textstoadd)))}
data = pd.concat([data, pd.DataFrame(datatoadd)], ignore_index=True)

# Add fixed WSC273 data to control dataset
with open('WSC273_unambiguous.txt') as file:
  text = [line.strip() for line in file]
# Minimal-pair ids are assigned by position, so the control file needs exactly
# one line per WSC sentence. Fail with a readable message instead of the generic
# pandas length error.
# NOTE(review): the file is assumed to be in the same (sorted) order as textstoadd - confirm.
if len(text) != len(textstoadd):
  raise ValueError('WSC273_unambiguous.txt has %d lines but wsc273 has %d sentences'
                   % (len(text), len(textstoadd)))
datatoadd = {'class' : [3] * len(text), 'subclass' : ['Referential ambiguity'] * len(text),
             'dataset': ['WSC273'] * len(text), 'text' : text, 'minimalpair': list(range(i, i+len(text)))}
control_data = pd.concat([control_data, pd.DataFrame(datatoadd)], ignore_index=True)
i = len(control_data)

Language and Vision Ambiguities Dataset

In [None]:
# Add LAVA sentences to dataset and control dataset
df2 = pd.read_json('lava.json')
# Shuffling first makes drop_duplicates keep a random row per ambiguous sentence.
# sort_index() then restores the file order: the sections below are selected by
# row position, which would attach subclass labels to random sentences if the
# frame stayed shuffled.
# NOTE(review): the slice boundaries are assumed to describe the de-duplicated
# frame in file order - confirm against lava.json.
df2 = (df2.sample(frac=1)
          .drop_duplicates(subset=["text"])
          .sort_index()
          .reset_index(drop=True))

def _lava_frame(texts, class_id, subclass, first_pair):
  """Build the rows for one LAVA section, numbering minimal pairs from `first_pair`."""
  n_rows = len(texts)
  return pd.DataFrame({'class' : [class_id] * n_rows, 'subclass' : [subclass] * n_rows,
                       'dataset' : ['LAVA'] * n_rows, 'text' : texts.tolist(),
                       'minimalpair': list(range(first_pair, first_pair + n_rows))})

# (row slice, class, subclass) for every LAVA section
lava_sections = [
  (slice(None, 32), 2, 'PP attachment ambiguity'),
  (slice(32, 92), 2, 'VP attachment ambiguity'),
  (slice(92, 132), 1, 'Scopal ambiguity'),
  (slice(132, 148), 2, 'PP attachment ambiguity'),
  (slice(148, 183), 3, 'Missing information'),    # Logical form
  (slice(183, 219), 3, 'Referential ambiguity'),  # Anaphora
  (slice(219, None), 3, 'Missing information'),   # Ellipsis
]
for rows, class_id, subclass in lava_sections:
  # The ambiguous sentence and its unambiguous version get the same minimal-pair
  # id. Every section now sets 'minimalpair' on the main frame as well (the
  # second PP-attachment section used to leave it empty).
  data = pd.concat([data, _lava_frame(df2['text'].iloc[rows], class_id, subclass, i)],
                   ignore_index=True)
  control_data = pd.concat([control_data, _lava_frame(df2['unambiguoustext'].iloc[rows], class_id, subclass, i)],
                           ignore_index=True)
  i = len(control_data)

CLArifying Insertions from Revision Edits dataset

In [None]:
# (value of the 'Resolved pattern' column, subclass label used in the dataframes)
claire_patterns = [('IMPLICIT REFERENCE', 'Implicit reference'),
                   ('FUSED HEAD', 'Fused head'),
                   ('ADDED COMPOUND', 'Added compound'),
                   ('METONYMIC REFERENCE', 'Metonymic reference')]

def _claire_rows(frame, grammatical_rows):
  """Collect the unique 'fixed text' values per pattern and keep the grammatical rows.

  The four patterns are stacked in the order of `claire_patterns` with a fresh
  0..n-1 index; only rows whose index is in `grammatical_rows` are returned.
  A copy is returned so that columns can be added without a
  SettingWithCopyWarning.
  """
  parts = []
  for pattern, subclass in claire_patterns:
    texts = list(frame.loc[frame['Resolved pattern'] == pattern, 'fixed text'].unique())
    parts.append(pd.DataFrame({'class' : [3] * len(texts), 'subclass' : [subclass] * len(texts),
                               'dataset' : ['CLAIRE'] * len(texts), 'text' : texts}))
  stacked = pd.concat(parts, ignore_index=True)
  return stacked[stacked.index.isin(grammatical_rows)].copy()

for dataset_type, grammatical in zip(['train', 'dev', 'test'], [grammatical_train, grammatical_dev, grammatical_test]):
  # Load CLAIRE data (once per split) together with the filler scores
  df3 = pd.read_csv('CLAIRE_' + dataset_type + '_data.tsv', sep='\t')
  df3_scores = pd.read_csv('CLAIRE_' + dataset_type + '_scores.tsv', sep='\t', header=None)

  # Add CLAIRE sentences with the blank removed to the main dataframe
  df3['fixed text'] = df3['Sentence'].apply(lambda x: x.replace(' ______', ''))
  claire_data = _claire_rows(df3, grammatical)
  claire_data['minimalpair'] = list(range(i, i+len(claire_data['text'])))
  data = pd.concat([data, claire_data], ignore_index=True)

  # Map every sentence to the first row it occurs in. This gives the same row as
  # list.index() but avoids rescanning the whole column for every sentence.
  first_row_of_sentence = {}
  for row_number, known_sentence in enumerate(df3['Sentence'].tolist()):
    first_row_of_sentence.setdefault(known_sentence, row_number)

  def find_fixed_sentence(sentence):
    """Fill the blank of `sentence` with its highest-scoring filler.

    The scores file is read as five consecutive rows per sentence; the asserts
    check that the first and last of those rows carry the expected sentence id
    (row number + 1 for the train split, row number otherwise).
    """
    row = first_row_of_sentence[sentence]
    expected_id = row + 1 if dataset_type == "train" else row
    assert expected_id == int(df3_scores[0][row*5].split('_')[0])
    assert expected_id == int(df3_scores[0][row*5+4].split('_')[0])
    scores = df3_scores[1][row*5:row*5+5].tolist()
    best = scores.index(max(scores))
    return sentence.replace('______', df3['Filler' + str(best + 1)][row])

  # Add fixed CLAIRE sentences to the control dataframe
  df3['fixed text'] = df3['Sentence'].apply(find_fixed_sentence)
  claire_fixed_data = _claire_rows(df3, grammatical)
  claire_fixed_data['minimalpair'] = list(range(i, i+len(claire_fixed_data['text'])))
  control_data = pd.concat([control_data, claire_fixed_data], ignore_index=True)

  i = len(control_data)

AmbiEnt Dataset (not included in final dataset; used for validating experiments)

In [None]:
# Collect ambiguous AmbiEnt sentences and one randomly chosen disambiguation each
underspecified_lines = []
specified_lines = []
# Read both splits into ONE list; resetting the list between the two files
# silently discarded the dev split.
lines = []
for ambient_path in ('AmbiEnt_dev.jsonl', 'AmbiEnt_test.jsonl'):
  with jsonlines.open(ambient_path) as reader:
    for line in reader:
      lines.append(line)
for item in lines:
  # random.choice works for any number of disambiguations, whereas
  # randint(0, 1) needs at least two and ignores any beyond the second.
  if item['premise_ambiguous']:
    underspecified_lines.append(item['premise'])
    specified_lines.append(random.choice(item['disambiguations'])['premise'])
  elif item['hypothesis_ambiguous']:
    underspecified_lines.append(item['hypothesis'])
    specified_lines.append(random.choice(item['disambiguations'])['hypothesis'])

datatoadd = {'class' : [0] * len(underspecified_lines), 'subclass' : ['Various'] * len(underspecified_lines),
             'dataset' : ['AmbiEnt'] * len(underspecified_lines), 'text' : underspecified_lines,
             'minimalpair': list(range(i, i+len(underspecified_lines)))}
data = pd.concat([data, pd.DataFrame(datatoadd)], ignore_index=True)
datatoadd = {'class' : [0] * len(specified_lines), 'subclass' : ['Various'] * len(specified_lines),
             'dataset' : ['AmbiEnt'] * len(specified_lines), 'text' : specified_lines,
             'minimalpair': list(range(i, i+len(specified_lines)))}
control_data = pd.concat([control_data, pd.DataFrame(datatoadd)], ignore_index=True)
i = len(control_data)

Deictic Expressions

In [None]:
# Helper functions for deictic data
def intersection(lst1, lst2):
    """Return the items of `lst1` that also occur in `lst2`.

    The order and the duplicates of `lst1` are kept; membership is tested
    with the `in` operator.
    """
    shared = []
    for candidate in lst1:
        if candidate in lst2:
            shared.append(candidate)
    return shared

def findsentenceswithword(dataset, word, amount):
  """Return up to `amount` distinct sentences from `dataset` that contain `word`.

  dataset: list of sentence strings (surrounding whitespace is stripped).
  word:    token to look for; a sentence matches when `word` equals one of its
           whitespace-separated tokens.
  amount:  maximum number of sentences to return; 0 or less returns [].

  The sentences are visited in random order without replacement, so no entry
  of `dataset` is returned twice and every entry is considered.
  """
  output = []
  if amount <= 0:
    return output
  # random.sample (unlike random.choices) draws without replacement.
  for sentence in random.sample(dataset, k=len(dataset)):
    stripped = sentence.strip()
    if word in stripped.split():
      output.append(stripped)
      if len(output) >= amount:
        break
  return output

# Add deictic data to dataframe
deictic_data = []
# "left" and "right" are two separate entries; the expression `"left" and "right"`
# evaluates to just "right" and dropped "left" from the list.
deictic_wordslist = ["I", "you", "we", "she", "they", "here", "there", "that", "this", "now", "then",
                     "tonight", "tomorrow", "yesterday", "left", "right"]
# The sample file is read once and reused for the control sentences below.
with open("wiki_sample.txt") as file:
  wikipedia_data = file.readlines()
for word in deictic_wordslist:
  deictic_data += findsentenceswithword(wikipedia_data, word, 10)
datatoadd = {'class' : [3] * len(deictic_data), 'subclass' : ['Deixis'] * len(deictic_data),
             'dataset' : ['Wikipedia'] * len(deictic_data),  'text' : deictic_data,
             'minimalpair': list(range(i, i+len(deictic_data)))}
data = pd.concat([data, pd.DataFrame(datatoadd)], ignore_index=True)

# Control sentences: as many sentences without any deictic word as there are
# deictic sentences. random.sample draws without replacement, so no control
# sentence is picked twice.
deictic_control_data = []
for sentence in random.sample(wikipedia_data, k=len(wikipedia_data)):
  if len(deictic_control_data) >= len(deictic_data):
    break
  if not intersection(nltk.tokenize.word_tokenize(sentence), deictic_wordslist):
    deictic_control_data.append(sentence.strip())
datatoadd = {'class' : [3] * len(deictic_control_data), 'subclass' : ['Deixis'] * len(deictic_control_data),
             'dataset' : ['Wikipedia'] * len(deictic_control_data),  'text' : deictic_control_data,
             'minimalpair': list(range(i, i+len(deictic_control_data)))}
control_data = pd.concat([control_data, pd.DataFrame(datatoadd)], ignore_index=True)
i = len(control_data)

Homonymic Expressions

In [None]:
# Add homonymy data to dataframe
homonymy_data = []
homonymy_wordslist = pd.read_csv("Homonyms.csv")['Word'].unique()
# The sample file is read once and reused for the control sentences below.
with open("wiki_sample.txt") as file:
  wikipedia_data = file.readlines()
for word in homonymy_wordslist:
  homonymy_data += findsentenceswithword(wikipedia_data, word, 10)
datatoadd = {'class' : [4] * len(homonymy_data), 'subclass' : ['Homonymy'] * len(homonymy_data),
             'dataset' : ['Wikipedia'] * len(homonymy_data),  'text' : homonymy_data,
             'minimalpair': list(range(i, i+len(homonymy_data)))}
data = pd.concat([data, pd.DataFrame(datatoadd)], ignore_index=True)

# Control sentences: as many sentences without any homonym as there are
# homonymy sentences. random.sample draws without replacement, so no control
# sentence is picked twice.
homonymy_control_data = []
for sentence in random.sample(wikipedia_data, k=len(wikipedia_data)):
  if len(homonymy_control_data) >= len(homonymy_data):
    break
  if not intersection(nltk.tokenize.word_tokenize(sentence), homonymy_wordslist):
    homonymy_control_data.append(sentence.strip())
datatoadd = {'class' : [4] * len(homonymy_control_data), 'subclass' : ['Homonymy'] * len(homonymy_control_data),
             'dataset' : ['Wikipedia'] * len(homonymy_control_data),  'text' : homonymy_control_data,
             'minimalpair': list(range(i, i+len(homonymy_control_data)))}
control_data = pd.concat([control_data, pd.DataFrame(datatoadd)], ignore_index=True)
i = len(control_data)

Industrial requirements document (not used in final dataset)

In [None]:
# Load FN-REQ data and add to control data dataframe
max_requirements = 947  # at most this many requirement texts are added
requirements_texts = []
# The loop variable is deliberately NOT called `i`: that name is the notebook's
# global minimal-pair counter and must not be overwritten here.
for document_number in range(1, 81):
  filename = 'FN-REQ-' + str(document_number).zfill(3) + '.xml'
  try:
    tree = ET.parse(filename)
    for child in tree.getroot():
      if not child[0].text.isspace():
        requirements_texts.append(child[0].text)
  except (OSError, ET.ParseError, IndexError, AttributeError):
    # Best effort: a missing or unparsable file, or an element without a first
    # child / without text, ends the processing of this document. Other
    # exceptions (e.g. KeyboardInterrupt) are no longer swallowed.
    continue
selected_requirements = requirements_texts[:max_requirements]
datatoadd = {'class' : ['Control'] * len(selected_requirements),
             'subclass' : ['Industrial requirements'] * len(selected_requirements),
             'dataset' : ['FN-REQ'] * len(selected_requirements), 'text' : selected_requirements}
control_data = pd.concat([control_data, pd.DataFrame(datatoadd)], ignore_index=True)

Wikipedia sentences (not used in final dataset)

In [None]:
# Load Wikipedia dataset and fill the rest of the control data dataframe
# Number of rows the control frame is short of the main frame. Clamped at 0:
# a negative value used to produce empty label lists next to a non-empty
# `wikipedia_data[:negative]` slice, which makes the DataFrame constructor fail.
required_len = max(0, len(data['text']) - len(control_data['text']))
with open("wikisent2.txt") as file:
  wikipedia_pool = [line.strip() for line in file]
# Draw without replacement so that no sentence is added twice, and never ask
# for more sentences than the file contains.
wikipedia_data = random.sample(wikipedia_pool, k=min(required_len, len(wikipedia_pool)))
datatoadd = {'class' : ['Control'] * len(wikipedia_data), 'subclass' : ['Wikipedia'] * len(wikipedia_data),
             'dataset' : ['Wikipedia'] * len(wikipedia_data), 'text' : wikipedia_data}
control_data = pd.concat([control_data, pd.DataFrame(datatoadd)], ignore_index=True)

# Calculations for descriptive statistics

Word length & sentence length

In [None]:
# Length statistics, first for the main data and then for the control data:
# number of whitespace-separated words, mean characters per word, and number of
# characters per text.
def _mean_word_length(text):
  words = text.split()
  return sum(len(word) for word in words) / len(words)

for frame in (data, control_data):
  frame['length in words'] = frame['text'].str.split().apply(len)
  frame['word length'] = frame['text'].apply(_mean_word_length)
  frame['length in characters'] = frame['text'].str.len()

Vocabulary size

In [None]:
# Calculate vocabulary size
def _vocabulary_sizes(frame):
  """Return the number of distinct whitespace-separated tokens for class 1, 2, 3
  and 4 (in that order), followed by the count for the whole frame."""
  def count_types(texts):
    return len(set(' '.join(texts.tolist()).split()))
  # The loop variable is not called `i`, so the notebook's global minimal-pair
  # counter keeps its value.
  sizes = [count_types(frame[frame['class'] == class_id]['text']) for class_id in range(1, 5)]
  sizes.append(count_types(frame['text']))
  return sizes

vocabulary_mean = _vocabulary_sizes(data)
vocabulary_control_mean = _vocabulary_sizes(control_data)

Concreteness ratings

In [None]:
# Load concreteness ratings and calculate concreteness for data
df4 = pd.read_csv('Concreteness_ratings_Brysbaert_et_al_BRM.txt', sep='\t')
concreteness_dict = dict(zip(df4['Word'], df4['Conc.M']))
# Fallback for words without a rating; computed once instead of once per
# unknown word.
mean_concreteness = df4['Conc.M'].mean()

def find_concreteness(word):
  """Return the concreteness rating of `word`, or the mean rating if it has none.

  The lookup is an exact match against the 'Word' column.
  """
  return concreteness_dict.get(word, mean_concreteness)

def _average_concreteness(text):
  """Mean rating over the whitespace-separated words of `text`."""
  words = text.split()
  return sum(find_concreteness(word) for word in words) / len(words)

data['average concreteness'] = data['text'].apply(_average_concreteness)
control_data['average concreteness'] = control_data['text'].apply(_average_concreteness)

Age of acquisition scores

In [None]:
# Load AoA ratings and calculate AoA for data
df5 = pd.read_excel("AoA_ratings_Kuperman_et_al_BRM.xlsx")
AoA_dict = dict(zip(df5['Word'], df5['Rating.Mean']))
# Fallback for words without a rating; computed once instead of once per
# unknown word.
mean_AoA = df5['Rating.Mean'].mean()

def find_AoA(word):
  """Return the age-of-acquisition rating of `word`, or the mean rating if it has none.

  The lookup is an exact match against the 'Word' column.
  """
  return AoA_dict.get(word, mean_AoA)

def _average_AoA(text):
  """Mean rating over the whitespace-separated words of `text`."""
  words = text.split()
  return sum(find_AoA(word) for word in words) / len(words)

data['average AoA'] = data['text'].apply(_average_AoA)
control_data['average AoA'] = control_data['text'].apply(_average_AoA)

Word frequency scores

In [None]:
# Load unigram counts and calculate the average word frequency for data
df6 = pd.read_csv("unigram_freq.csv")
# Scale the counts so that the most frequent word has frequency 1.
df6['count'] = df6['count'] / df6['count'].max()

word_freq_dict = dict(zip(df6['word'], df6['count']))
# Fallback for words that are not in the list; computed once instead of once
# per unknown word.
mean_freq = df6['count'].mean()

def find_freq(word):
  """Return the scaled frequency of `word`, or the mean scaled frequency if unknown.

  The lookup is an exact match against the 'word' column.
  """
  return word_freq_dict.get(word, mean_freq)

def _average_freq(text):
  """Mean scaled frequency over the whitespace-separated words of `text`."""
  words = text.split()
  return sum(find_freq(word) for word in words) / len(words)

data['Average word frequency'] = data['text'].apply(_average_freq)
control_data['Average word frequency'] = control_data['text'].apply(_average_freq)

Print summaries

In [None]:
# Calculate summaries of data
def print_means(dataset, vocabulary, by='subclass'):
  """Return the mean of every descriptive statistic per group plus an 'Overall' row.

  dataset:    dataframe holding the statistic columns, the `by` column and 'text'.
  vocabulary: vocabulary sizes, one per row of the result (groups in sorted
              order, then 'Overall'). If it is None or its length does not match
              the number of rows, the sizes are computed from `dataset` instead:
              the number of distinct whitespace-separated tokens per group and
              overall.
  by:         column to group on.
  """
  stat_columns = ['length in words', 'length in characters', 'word length',
                  'average concreteness', 'average AoA', "Average word frequency"]
  # Only the statistic columns are averaged; taking the mean of text columns
  # raises a TypeError in pandas 2.
  summary_mean = dataset.groupby(by)[stat_columns].mean()
  overall_mean = pd.DataFrame(dataset[stat_columns].mean()).T
  overall_mean.index = ['Overall']
  summary_mean = pd.concat([summary_mean, overall_mean])
  if vocabulary is None or len(vocabulary) != len(summary_mean):
    def count_types(texts):
      return len(set(' '.join(texts.tolist()).split()))
    # groupby sorts the groups the same way as above, so the rows line up.
    vocabulary = dataset.groupby(by)['text'].apply(count_types).tolist()
    vocabulary.append(count_types(dataset['text']))
  summary_mean['vocabulary size'] = vocabulary
  return summary_mean

def print_stds(dataset, by='subclass'):
  """Return the standard deviation of every descriptive statistic per group plus
  an 'Overall' row.

  dataset: dataframe holding the statistic columns and the `by` column.
  by:      column to group on.
  """
  stat_columns = ['length in words', 'length in characters', 'word length',
                  'average concreteness', 'average AoA', "Average word frequency"]
  # Only the statistic columns are aggregated; aggregating text columns raises
  # in pandas 2.
  summary_std = dataset.groupby(by)[stat_columns].std()
  overall_std = pd.DataFrame(dataset[stat_columns].std()).T
  overall_std.index = ['Overall']
  summary_std = pd.concat([summary_std, overall_std])
  return summary_std

# Print the per-subclass means of the main data and of the control data.
# Uncomment the four lines at the end to also print the standard deviations.

print(print_means(data,vocabulary_mean))
print('------------------')
print(print_means(control_data, vocabulary_control_mean))
# print('------------------')
# print(print_stds(data))
# print('------------------')
# print(print_stds(control_data))



# Visualization

In [None]:
def violinplotter(statistic, subdivision):
  """Save a violin plot of `statistic` per `subdivision`, split by underspecification.

  statistic:   name of the numeric column to plot on the y axis.
  subdivision: name of the column to group by on the x axis.

  The main data (underspecified=True) and the control data
  (underspecified=False) are plotted side by side, and the figure is written to
  '<statistic>_by_<subdivision>.png'.
  """
  # seaborn is not imported in the notebook's import cell, so import it here.
  import seaborn
  # assign() returns copies: the global frames do not get an extra column.
  temp = pd.concat([data.assign(underspecified=True),
                    control_data.assign(underspecified=False)], ignore_index=True)
  plt.figure(figsize=(10,6))
  # y follows the `statistic` argument; it used to be hard-coded to
  # 'OPT-13b perplexity', a column this notebook never creates.
  ax = seaborn.violinplot(data=temp, x=subdivision, y=statistic, hue='underspecified', cut=0)
  #ax.set_ylim(0,1500)
  plt.xticks(rotation = 90)
  plt.savefig(statistic + "_by_" + subdivision + '.png', bbox_inches='tight')
violinplotter('Average word frequency', 'subclass')

# Miscellaneous

Calculate significant differences for descriptive statistics

In [None]:
# Calculate significant differences for descriptive statistics
import scipy.stats as stats
# Column that defines the groups. Change to 'class' for significant differences
# per class; both the list of groups and the row filters below use this one
# setting.
group_column = 'subclass'
classes = np.sort(data[group_column].unique())
print(classes)
relevant = data
control_relevant = control_data
statistic = 'Average word frequency'
# First entry: p-value of the Mann-Whitney U test over all rows; then one
# p-value per group, in the order printed above.
row_of_matrix = []
_, p = stats.mannwhitneyu(relevant[statistic], control_relevant[statistic])
row_of_matrix.append(p)
for group in classes:
  group_values = relevant[relevant[group_column] == group][statistic]
  control_values = control_relevant[control_relevant[group_column] == group][statistic]
  if len(group_values) == 0 or len(control_values) == 0:
    # No test is possible without observations on both sides.
    row_of_matrix.append(np.nan)
    continue
  _, p = stats.mannwhitneyu(group_values, control_values)
  row_of_matrix.append(p)

print(row_of_matrix)

Save data

In [None]:
# Write the main frame and the control frame to CSV, without the index column
for frame, csv_path in ((data, 'scriptie_data.csv'), (control_data, 'scriptie_control_data.csv')):
  frame.to_csv(csv_path, index=False)