# Exploring the Swedish Kelly List Dataset


September 26, 2022

## Setup

Install required libraries and restart the kernel.

In [None]:
# Installs the third-party packages this notebook imports (bokeh, datasets, pandas, rich).
# NOTE(review): versions are unpinned. The CDSView(source=..., filters=[...]) call used later
# looks like the pre-3.0 Bokeh form of that API — confirm and consider pinning known-good versions.
%pip install bokeh datasets pandas rich

## Load the dataset

In [2]:
from bokeh.io import output_notebook, show
from bokeh.models import BooleanFilter, CDSView, ColumnDataSource, Range1d
from bokeh.plotting import figure
from bokeh.transform import factor_cmap
from datasets import load_dataset
import pandas as pd
from rich import pretty, print

# Turn on rich's pretty-printing and send Bokeh output to the notebook.
pretty.install()
output_notebook()

# Load the "train" split at a fixed revision so re-runs read the same data.
dataset = load_dataset(
    "codesue/kelly",
    split="train",
    revision="2137d4b378715475fb63be6fee0258992c20388e",
)

# Print the dataset's own description, feature schema and homepage.
info = dataset.info
print(info.description)
print("The dataset has the following features:")
print(info.features)
print(f"You can learn more about this dataset at {info.homepage}.")

Using custom data configuration default
Reusing dataset kelly (/Users/sue/.cache/huggingface/datasets/codesue___kelly/default/1.0.1/d656304bc4c94b49f809895afb0dd4dd50c57abd4d2a8d367e2b16877d1564ca)


## Explore the dataset

In [3]:
df = pd.DataFrame.from_dict(dataset)
# Character count of each lemma; spaces and punctuation are included.
df["lemma_length"] = df["lemma"].str.len()
# Number of whitespace-separated tokens in each lemma.
df["num_words"] = df["lemma"].str.split().str.len()

In [4]:
# Wrap the frame so glyphs can refer to its columns by name.
source = ColumnDataSource(df)

# Scatter of every lemma: dataset id on x, lemma length on y.
p = figure(width=900, height=500)
p.xaxis.axis_label = 'Lemma ID'
p.yaxis.axis_label = 'Lemma Length'
p.grid.grid_line_color = None
p.toolbar.autohide = True
p.circle(source=source, color="black", x="id", y="lemma_length")
show(p)

### Variation by CEFR level

In [5]:
# Distinct CEFR levels in sorted order; each one is paired with a palette colour.
cefr_levels = sorted(df["cefr_level"].unique())
palette = ("#35193e", "#701f57", "#ad1759", "#e13342", "#f37651", "#f6b48f")
colors = factor_cmap("cefr_level", palette, cefr_levels)

# Remove the glyphs already on the figure so the coloured layer replaces them
# instead of being stacked on top (and so re-running this cell does not add
# yet another copy).
p.renderers = []
p.circle(source=source, x="id", y="lemma_length", color=colors, legend_field="cefr_level")
p.legend.title = "CEFR Level"
p.legend.orientation = "horizontal"
p.legend.location = "top_right"
show(p)

Number of lemmas per level:

In [6]:
# Tally the lemmas at each CEFR level, then order the tally by level label.
level_counts = df["cefr_level"].value_counts()
level_counts.sort_index()

Aggregate lemma-length statistics per CEFR level:

In [7]:
# Summarise lemma length within each CEFR level.
summary_stats = ["min", "max", "mean", "median", "std"]
lemma_length_by_level = df.groupby("cefr_level")["lemma_length"]
lemma_length_by_level.agg(summary_stats)

Unnamed: 0_level_0,min,max,mean,median,std
cefr_level,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A1,1,38,6.438034,5.0,4.345356
A2,2,37,7.241453,7.0,3.384079
B1,2,36,7.645299,7.0,3.05163
B2,2,29,7.931624,8.0,3.190264
C1,2,20,7.697293,7.0,2.682318
C2,2,24,7.96726,8.0,2.797345


### Sampling Lemmas Longer Than 12 Characters

In [8]:
boundary_length = 12
# Boolean mask over the rows of df: True where the lemma has more than
# `boundary_length` characters.
is_long_lemma = df["lemma"].str.len() > boundary_length


def long_lemmas_by_level(level):
    """Return the rows of ``df`` holding long lemmas at one CEFR level.

    Args:
        level: Value compared against the ``cefr_level`` column, e.g. ``"A1"``.

    Returns:
        The subset of ``df`` where ``is_long_lemma`` is true and
        ``cefr_level`` equals ``level``.
    """
    return df[is_long_lemma & (df["cefr_level"] == level)]

Number of long lemmas per level:

In [9]:
# Per-level tally restricted to the rows flagged by the long-lemma mask.
long_lemma_levels = df.loc[is_long_lemma, "cefr_level"]
long_lemma_levels.value_counts().sort_index()

A1 lemmas longer than 12 characters:

In [10]:
# Show at most the first 25 long lemmas at level A1.
long_lemmas_by_level("A1").head(25)

Unnamed: 0,id,raw_frequency,relative_frequency,cefr_level,source,marker,lemma,pos,examples,lemma_length,num_words
25,26,,,A1,manual,en,inrikesminister,noun-en,,15,1
26,27,,,A1,manual,,inrikespolitik,noun-en,,14,1
60,61,,,A1,manual,,Storbritannien,proper name,,14,1
77,78,,,A1,manual,en,utbildningsminister,noun-en,,19,1
86,87,2966316.0,26019.68,A1,SweWaC,,och (vardagl. å; förk. o.),conj,,26,5
87,88,2624032.0,23017.26,A1,SweWaC,att,vara (vardagl. va),verb,e.g. var så god!,18,3
102,103,1034410.0,9073.55,A1,SweWaC,,"inte (formellt: icke, ej)",adverb,,25,4
114,115,505860.0,4437.26,A1,SweWaC,,de (vardagl. dom),det,,17,3
118,119,395479.0,3469.03,A1,SweWaC,,sig (vardagl. sej),pronoun,,18,3
120,121,381431.0,3345.8,A1,SweWaC,,de (vardagl. dom),pronoun,,17,3


C2 lemmas longer than 12 characters:

In [11]:
# Show at most the first 25 long lemmas at level C2.
c2_long_lemmas = long_lemmas_by_level("C2")
c2_long_lemmas.head(25)

Unnamed: 0,id,raw_frequency,relative_frequency,cefr_level,source,marker,lemma,pos,examples,lemma_length,num_words
7053,7054,,3.33,C2,T2,en,funktionalitet,noun-en,,14,1
7065,7066,,3.31,C2,T2,,revolutionerande,adjective,,16,1
7079,7080,,3.27,C2,T2,en,specifikation,noun-en,,13,1
7084,7085,,3.25,C2,T2,en,internationalisering,noun-en,,20,1
7146,7147,,3.11,C2,T2,en,specialisering,noun-en,,14,1
7154,7155,,3.09,C2,T2,en,styrelseordförande,noun-en,,18,1
7160,7161,,3.08,C2,T2,en,telekommunikation,noun-en,,17,1
7161,7162,,3.07,C2,T2,,karakteristisk,adjective,,14,1
7162,7163,,3.07,C2,T2,en,åklagarmyndighet,noun-en,,16,1
7181,7182,,3.03,C2,T2,en,samstämmighet,noun-en,,13,1


### Variation by Number of Words per Lemma

In [12]:
# Remove the glyphs drawn by earlier cells so this layer is not stacked on top of them.
p.renderers = []
# The circle glyph has no `width` property; `line_width` is used instead so that
# lemmas with more words are drawn with a heavier outline.
p.circle(
    source=source,
    x="id",
    y="lemma_length",
    line_width="num_words",
    color=colors,
    legend_field="cefr_level",
)
# Restrict the y-axis to lengths from `boundary_length` upwards; the upper bound
# is the next multiple of 5 strictly above the longest lemma.
max_length = df.lemma_length.max()
p.y_range = Range1d(boundary_length, (max_length + 5 - (max_length % 5)))
show(p)

Aggregate statistics for the number of words in long lemmas, per CEFR level:

In [13]:
# Summarise the word count of long lemmas within each CEFR level.
word_count_stats = ["min", "max", "mean", "median", "std"]
long_lemma_word_counts = df[is_long_lemma].groupby("cefr_level")["num_words"]
long_lemma_word_counts.agg(word_count_stats)

Unnamed: 0_level_0,min,max,mean,median,std
cefr_level,Unnamed: 1_level_1,Unnamed: 2_level_1,Unnamed: 3_level_1,Unnamed: 4_level_1,Unnamed: 5_level_1
A1,1,9,3.430556,3.0,1.599528
A2,1,6,2.274194,1.5,1.439141
B1,1,4,1.546667,1.0,0.934234
B2,1,6,1.346154,1.0,0.921894
C1,1,4,1.169014,1.0,0.632137
C2,1,4,1.081081,1.0,0.489807


Filtering out multiword lemmas:

In [14]:
# Start from an empty figure so only the filtered glyphs are drawn.
p.renderers = []
# Boolean mask over the rows of df: True where the lemma is a single word.
is_one_word_lemma = df["num_words"] == 1
view = CDSView(source=source, filters=[BooleanFilter(is_one_word_lemma)])
# The circle glyph has no `width` property; `line_width` is used instead. Every
# lemma that passes the filter has num_words == 1, so the outline width is 1.
p.circle(
    source=source,
    x="id",
    y="lemma_length",
    line_width="num_words",
    color=colors,
    legend_field="cefr_level",
    view=view,
)
# Restrict the y-axis to lengths from `boundary_length` upwards; the upper bound
# is the next multiple of 5 strictly above the longest lemma.
max_length = df.lemma_length.max()
p.y_range = Range1d(boundary_length, (max_length + 5 - (max_length % 5)))
show(p)

Single-word A1 lemmas longer than 12 characters:

In [15]:
# Rows that are long, single-word, and at level A1.
is_a1 = df["cefr_level"] == "A1"
df[is_long_lemma & is_one_word_lemma & is_a1]

Unnamed: 0,id,raw_frequency,relative_frequency,cefr_level,source,marker,lemma,pos,examples,lemma_length,num_words
25,26,,,A1,manual,en,inrikesminister,noun-en,,15,1
26,27,,,A1,manual,,inrikespolitik,noun-en,,14,1
60,61,,,A1,manual,,Storbritannien,proper name,,14,1
77,78,,,A1,manual,en,utbildningsminister,noun-en,,19,1
479,480,25384.0,222.66,A1,SweWaC,,internationell,adjective,,14,1
727,728,15700.0,137.72,A1,SweWaC,en,förutsättning,noun-en,,13,1
1051,1052,,90.73,A1,T2,,grundläggande,adjective,,13,1
1139,1140,9484.0,83.19,A1,SweWaC,,antingen…eller,conj,,14,1
1345,1346,7668.0,67.26,A1,SweWaC,,arbetsmarknad,noun-en,e.g. arbetsmarknaden,13,1


### Counts of Lemmas Longer Than Six Characters

Overall:

In [16]:
# Per-level tally of lemmas whose length exceeds six.
exceeds_six = df["lemma_length"] > 6
df.loc[exceeds_six, "cefr_level"].value_counts().sort_index()

Single-word lemmas:

In [17]:
# Per-level tally of single-word lemmas whose length exceeds six.
single_word_over_six = (df["lemma_length"] > 6) & is_one_word_lemma
df.loc[single_word_over_six, "cefr_level"].value_counts().sort_index()

In [18]:
# Share of each level's lemmas that are single-word and longer than six:
# the filtered per-level tally divided by the overall per-level tally.
single_word_over_six = (df["lemma_length"] > 6) & is_one_word_lemma
counts_over_six = df.loc[single_word_over_six, "cefr_level"].value_counts().sort_index()
counts_all = df["cefr_level"].value_counts().sort_index()
counts_over_six / counts_all

NB: To export a dataframe to HTML, use `df.to_html("filename.html", columns=("id", "cefr_level", "lemma", "lemma_length"), index=False)`.