In [None]:
!pip install python-Levenshtein

In [None]:
import pandas as pd
import re
from Levenshtein import distance
from os.path import join

SEED = 1

# Read

In [None]:
from src.static.settings import SID_DATA_BASE
# Load the course-description CSV from the directory configured in SID_DATA_BASE.
df = pd.read_csv(join(SID_DATA_BASE, 'kurse-beschreibungen.csv'))

In [None]:
# Keep only the first row for each distinct 'Name', then preview the result.
df = df.drop_duplicates(subset='Name')
df.head()

In [None]:
# Summary statistics of the table after de-duplication by name.
df.describe()

# Only consider those with description

In [None]:
# Discard every row whose 'Beschreibung' is missing.
df = df[df['Beschreibung'].notna()]

In [None]:
# Character length of every description. `assign` returns a new frame, which avoids the
# SettingWithCopyWarning that `.loc[:, col] = ...` can raise on a frame obtained by filtering.
df = df.assign(desc_len=df['Beschreibung'].map(len))

## How long are the descriptions?

In [None]:
# Histogram of description lengths, restricted to values strictly within three
# standard deviations of the mean (lower bound clipped at 0).
len_mean = df['desc_len'].mean()
len_std = df['desc_len'].std()
minlen = max(0, len_mean - 3 * len_std)
maxlen = len_mean + 3 * len_std
within_bounds = (df['desc_len'] > minlen) & (df['desc_len'] < maxlen)
tmp = df[within_bounds]
tmp['desc_len'].hist();

In [None]:
# Zoom in on the rows whose description is shorter than 500 characters.
# NOTE(review): DataFrame.hist() draws one histogram per numeric column, not only
# 'desc_len' — confirm that this is intended.
df[df['desc_len'] < 500].hist();

### Sample-Descriptions: short & long

In [None]:
# Up to 30 example descriptions that are shorter than 100 characters.
df.loc[df['desc_len'] < 100, 'Beschreibung'].head(30).tolist()

In [None]:
# Up to 2 example descriptions that are longer than 3000 characters.
df.loc[df['desc_len'] > 3000, 'Beschreibung'].head(2).tolist()

## Select those with at least *some* letters

In [None]:
# Keep only descriptions longer than 10 characters; the helper length column is
# dropped again afterwards. The last line shows the remaining row count.
has_text = df['desc_len'] > 10
df = df.loc[has_text].drop(columns='desc_len')
len(df)

# Looking at VeranstaltungsNummer

In [None]:
# All rows whose VeranstaltungsNummer occurs more than once.
dups = df[df.duplicated(subset='VeranstaltungsNummer', keep=False)]
# Sort once (instead of once per loop iteration) so that equal numbers are adjacent.
dups_sorted = dups.sort_values('VeranstaltungsNummer')[['VeranstaltungsNummer', 'Name']]

# Show five windows of 8 rows each, spread across the sorted table.
zehntel = len(dups) // 10
start_indices = [10, zehntel, 3*zehntel, 5*zehntel, 7*zehntel]
with pd.option_context('display.max_rows', 101, 'display.max_columns', 20, 'display.expand_frame_repr', False, 'display.max_colwidth', 120):
    for i in start_indices:
        display(dups_sorted.iloc[i:].head(8))

**Result**: Kinda inconclusive. Sometimes the same VeranstaltungsNummer means it's a duplicate, sometimes it doesn't.

In [None]:
# Remove parenthesised parts from the titles, then de-duplicate on the cleaned name.
# regex=True is passed explicitly: pandas >= 2.0 defaults to regex=False, and a
# compiled pattern combined with regex=False raises a ValueError.
df = df.assign(Name=df['Name'].str.replace(re.compile(r'\([^)]*\)'), '', regex=True))
print(len(df))
df = df.drop_duplicates(subset='Name')
print(len(df))

In [None]:
# Show a reproducible random sample (seeded with SEED) with a very wide column limit
# so that the descriptions are not cut off.
# NOTE(review): 100 rows are sampled but 'display.max_rows' is 50, so pandas will
# truncate the rendered table — confirm whether all 100 rows should be visible.
with pd.option_context('display.max_rows', 50, 'display.max_columns', 20, 'display.expand_frame_repr', False, 'display.max_colwidth', 5000): 
    display(df.sample(100, random_state=SEED))

## Absolute lower bound 

In [None]:
# Conservative subset: one row per VeranstaltungsNummer, descriptions longer than 200 characters.
tmp = df.drop_duplicates(subset='VeranstaltungsNummer')
# `assign` builds a new frame, which avoids the SettingWithCopyWarning that
# `.loc[:, col] = ...` can raise on a frame derived from another one.
tmp = tmp.assign(desc_len=tmp['Beschreibung'].map(len))
tmp = tmp[tmp['desc_len'] > 200]

#### Number of VLs by first letter of VeranstaltungsNummer

In [None]:
# One (first character of VeranstaltungsNummer, course name, row position in tmp) triple per row.
lst = [
    (nummer[0], name, pos)
    for pos, (nummer, name) in enumerate(zip(tmp['VeranstaltungsNummer'], tmp['Name']))
]

# Group the (name, row position) pairs by that first character.
res_dict = {}
for key, val, nr in lst:
    res_dict.setdefault(key, []).append((val, nr))

In [None]:
def do_it(groups=None, max_dist=5, max_shown=20, dist_fn=None):
    """Flag entries whose title is nearly identical to an earlier title in the same group.

    Within every group each pair of titles is compared once. If their distance is
    below ``max_dist``, the row number of the later entry is recorded as a duplicate.

    Args:
        groups: Mapping ``key -> list of (title, row_number)`` tuples. Defaults to
            the notebook-level ``res_dict``.
        max_dist: Two titles count as duplicates when their distance is strictly
            smaller than this value.
        max_shown: Maximum number of matching pairs that are printed as examples.
        dist_fn: Callable ``(str, str) -> number``. Defaults to the notebook-level
            ``distance`` (imported from ``Levenshtein``).

    Returns:
        Dict mapping every key of ``groups`` to the set of row numbers flagged as
        duplicates (an empty set if none were found).
    """
    if groups is None:
        groups = res_dict
    if dist_fn is None:
        dist_fn = distance

    num_things = 0
    all_dups = {}
    for key, val in groups.items():
        dups = set()
        # `pos` is the position inside this group. The previous version reused one
        # name for both the enumerate index and the entry's row number, so the slice
        # below started at the row number and skipped most pairs.
        for pos, (first, _) in enumerate(val):
            for second, numsecond in val[pos + 1:]:
                if dist_fn(first, second) < max_dist:
                    if num_things < max_shown:
                        print(numsecond, '  ', first, '  |  ',second)
                        num_things += 1
                    dups.add(numsecond)
        all_dups[key] = dups
    return all_dups
                    
# Run the near-duplicate search and print the flagged row numbers per key.
dups = do_it()
print(dups)

In [None]:
# Flatten the per-key duplicate sets into one list and print how many rows of
# tmp are left once the flagged entries are subtracted.
alls = [nr for flagged in dups.values() for nr in flagged]

print(len(tmp) - len(alls))

## Languages

TODO: figure out if I make this plot after the correct amount of preprocessing/throwout

In [None]:
from tqdm import tqdm
from langdetect import detect
from langdetect.lang_detect_exception import LangDetectException
from collections import defaultdict
from matplotlib import pyplot as plt

In [None]:
from langdetect import DetectorFactory

# langdetect is non-deterministic by default; fixing its seed makes the language
# counts reproducible across runs.
DetectorFactory.seed = SEED

di = df.set_index("Name")["Beschreibung"].to_dict()

# Count the detected language per description; descriptions for which langdetect
# raises LangDetectException are counted under "unk".
languages = defaultdict(int)
for desc in tqdm(di.values()):
    try:
        languages[detect(desc)] += 1
    except LangDetectException:
        languages["unk"] += 1

# Plain dict, ordered by descending count.
languages = dict(sorted(languages.items(), key=lambda x: x[1], reverse=True))
languages

In [None]:
# Named `shown_langs` instead of `display`: rebinding `display` would shadow IPython's
# display() helper that other cells of this notebook call.
shown_langs = {"de": "german", "en": "english"}

# Everything that is neither German nor English (including "unk") is summed up as "other".
disp_lan = {"other": sum(v for k, v in languages.items() if k not in shown_langs)}
for code, label in shown_langs.items():
    # .get(..., 0) keeps the cell working when a language does not occur at all.
    disp_lan[label] = languages.get(code, 0)

print(disp_lan)
print(sum(disp_lan.values()))

In [None]:
import plotly.graph_objs as go
from plotly import tools


# Pie chart of the language buckets; every slice is labelled with name, count and share.
chart = go.Pie(
    labels=[*disp_lan.keys()],
    values=[*disp_lan.values()],
    marker={'line': {'color': '#FFF', 'width': 2}},
    domain={'x': [0.0, 1], 'y': [0.0, 1]},
    showlegend=False,
    name='Language Distribution',
    textinfo='label+value+percent',
)

# Alternative fixed-size layout with a title (disabled):
# layout = go.Layout(height=600, width=1000, autosize=False,
#                    title='Language Distribution for the dataset-descriptions')
layout = go.Layout(autosize=True)

fig = go.Figure(data=[chart], layout=layout)
# Draw the slice labels outside the pie and apply the uniform-text settings.
fig.update_traces(textposition='outside')
fig.update_layout(uniformtext_minsize=16, uniformtext_mode='hide')

fig.show()

## Fachbereiche

In [None]:
from fb_classifier.preprocess_data import make_classifier_class
from src.fb_classifier.util.load_data import load_data
import os
from src.static.settings import SID_DATA_BASE

# Load the same CSV through the project's load_data helper (under the key "all") and pass
# it to make_classifier_class, asking it to save a plot to ./faculty_plot.png.
# NOTE(review): what load_data returns and what make_classifier_class computes is not
# visible here — presumably a per-faculty (Fachbereich) class distribution; confirm.
data = load_data({"all": os.path.join(SID_DATA_BASE, "kurse-beschreibungen.csv")})
make_classifier_class("all", data["all"], save_plot="./faculty_plot.png");

# Results

* Estimated absolute lower bound for useful entries: ~5.8k
* More likely ~21k useful entries
* The Schockaert paper uses datasets of size 14k, 1.3k, 11k and 3.7k

--> Seems possible.

# After-notes

**Achtung**: Bspw. die Sprachenzentrum-Kurse haben alle die gleiche Beschreibung, obwohl es komplett verschiedene Kurse sind.