In [31]:
import pandas as pd
import os
from zipfile import ZipFile
from bs4 import BeautifulSoup
from sklearn.model_selection import train_test_split
from nltk.stem import PorterStemmer
stemmer = PorterStemmer()
import re
import nltk
from nltk.corpus import stopwords
import fasttext 
from sklearn.feature_extraction.text import TfidfVectorizer,CountVectorizer
from gensim.utils import simple_preprocess

# Analysis on data dump
This notebook explores reading data dumps directly from the server (Rnd2), as opposed to obtaining the data through API calls.

This notebook uses a recent data dump with normalized language tags, produced at the end of June 2022 for the project Etranslate.

# Data collection

In [7]:
#data dump Europeana June 2022 location on the Rnd2 server
data_path="/projects/etranslate-data-dump"

In [8]:
!pwd

/home/pscalia/git/rd-europeana-translate/data_analysis/from_datadump


In [9]:
#listing files in directory
list_data_files=os.listdir(data_path)

In [10]:
len(list_data_files)

2089

# Experiment with zip files

In [69]:
#locating zip file. Each zip file  contains a collection
file_name = os.path.join(data_path,'02301.zip')
file_name

'/projects/etranslate-data-dump/02301.zip'

In [70]:
with ZipFile(file_name, 'r') as zipObject:
     listOfFileNames = zipObject.namelist()

In [None]:
# Extract into the current working directory every archive member that is
# not already present there as a regular file.
with ZipFile(file_name, 'r') as zipObject:
    listOfFileNames = zipObject.namelist()
    for fileName in listOfFileNames:
        if not os.path.isfile(fileName):
            zipObject.extract(fileName)
            print('file extracted')
        else:
            print('file already exist')

In [72]:
listOfFileNames[0]

'urn_imss_biography_020216.xml'

In [39]:
dat='00180020C7AF376F0C82A5F47CAD7BED272DF62A.xml'

In [73]:
#open one of the xml unzipped files
with open(listOfFileNames[0], 'r') as f:
	file = f.read() 

In [74]:
#use beaut soup for parsing xml
soup = BeautifulSoup(file, 'xml')

In [79]:
#finding all description tag
t=soup.find('description')

In [80]:
t.text

"After a humanist education in fifteenth-century Florence, Amerigo Vespucci engaged in commercial and financial ventures. Sent to Seville by Lorenzo di Pierfrancesco de' Medici, he settled there and began to work with the Spanish and Portuguese travelers who, in the wake of Christopher Columbus, [...]"

In [55]:
t.get('xml:lang')

'en'

In [None]:
# find attributes of title
soup.find()

In [None]:
df=pd.DataFrame([t.text], columns=["text"])

In [None]:
df.text[0]

# Read all files from a given collection and build clustering

This builds on <https://www.jeansnyman.com/posts/unsupervised-text-clustering-with-k-means/>. The idea is to look at ways of measuring similarity among texts. The goal is to identify texts similar to those in Europeana, in order to find texts in under-represented languages that can be used for training.

In [15]:
# Let's do some text cleanup.
# Raw strings keep the regex escapes (\[, \], \|, \d) away from Python's own
# string-escape processing, which warns about unknown escapes.
REPLACE_BY_SPACE_RE = re.compile(r'[/(){}\[\]\|@,;]')
BAD_SYMBOLS_RE = re.compile(r'[^0-9a-z #+_]')
# Digits only. The former character class '[\d+]' also stripped a literal '+',
# which BAD_SYMBOLS_RE deliberately keeps.
REMOVE_NUM = re.compile(r'\d+')
# Masked placeholder tokens such as "xx" or "xxxx" (matched after lowercasing).
MASKED_TOKEN_RE = re.compile(r'\bx{2,}\b')
STOPWORDS = set(stopwords.words('english'))

def clean_text(text):
    """
    Normalize a raw English text before vectorization.

    Steps, in order: lowercase; replace separator punctuation with spaces;
    drop masked placeholder tokens (whole tokens made of two or more 'x');
    drop digits; drop every remaining character outside ``[0-9a-z #+_]``;
    drop English stopwords; drop words shorter than 2 or longer than 21
    characters; stem the remaining words with the notebook-level ``stemmer``.

    text: a string
    return: the cleaned string (may be empty)
    """
    # lowercase text
    text = text.lower()

    # replace REPLACE_BY_SPACE_RE symbols by space in text
    text = REPLACE_BY_SPACE_RE.sub(' ', text)

    # Remove the XXXX values. Only whole masked tokens are removed: a blanket
    # replace('x', '') would also corrupt ordinary words ("box" -> "bo").
    text = MASKED_TOKEN_RE.sub('', text)

    # Remove digits
    text = REMOVE_NUM.sub('', text)

    # delete symbols which are in BAD_SYMBOLS_RE from text
    text = BAD_SYMBOLS_RE.sub('', text)

    # delete stopwords, then words of fewer than 2 or more than 21 characters
    words = (word for word in text.split() if word not in STOPWORDS)
    words = (word for word in words if 2 <= len(word) <= 21)

    # Stemming the words
    return ' '.join(stemmer.stem(word) for word in words)

In [14]:
def read_single_collection(listOfFileNames, field):
    """
    Collect the English values of one metadata field from extracted XML files.

    listOfFileNames: iterable of paths of XML files that can be opened from the
        current working directory
    field: tag name to look up (e.g. 'description')
    return: list with the text of the first ``field`` element of each file,
        kept only when that element's ``xml:lang`` attribute equals 'en'

    NOTE(review): a later cell of this notebook defines a function with the
    same name but a different return value; whichever cell ran last wins.
    """
    descriptions = []
    for filename in listOfFileNames:
        # Read bytes so the parser takes the encoding from the XML declaration
        # instead of depending on the platform's default text encoding.
        with open(filename, 'rb') as f:
            soup = BeautifulSoup(f.read(), 'xml')
        element = soup.find(field)
        if element is not None and element.get('xml:lang') == 'en':
            descriptions.append(element.text)
    return descriptions

In [11]:
des=read_single_collection(listOfFileNames, 'description')

NameError: name 'listOfFileNames' is not defined

In [12]:
des

NameError: name 'des' is not defined

In [None]:
df=pd.DataFrame(des, columns=["descriptions"])

In [13]:
df.tail(2)

NameError: name 'df' is not defined

In [None]:
df["descriptions"] =df["descriptions"].apply(clean_text)

In [None]:
vectorizer = TfidfVectorizer(sublinear_tf= True, min_df=10, norm='l2', ngram_range=(1, 2), stop_words='english')
X_train_vc = vectorizer.fit_transform(df["descriptions"])

pd.DataFrame(X_train_vc.toarray(), columns=vectorizer.get_feature_names_out()).head()

In [None]:
k_clusters = 1
from matplotlib import pyplot as plt

In [None]:
from sklearn.cluster import KMeans

# Elbow method: fit KMeans for every cluster count from 1 to k_clusters and
# record the inertia of each fit.
score = []
for i in range(1,k_clusters + 1):
    kmeans = KMeans(n_clusters=i,init='k-means++',max_iter=800,n_init=5,random_state=0)
    kmeans.fit(X_train_vc)
    score.append(kmeans.inertia_)
# Plot inertia against the number of clusters; the figure is also written to elbow.png.
plt.plot(range(1,k_clusters + 1 ),score)
plt.title('The Elbow Method')
plt.xlabel('Number of clusters')
plt.ylabel('Score')
plt.savefig('elbow.png')
plt.show()

In [None]:
k_clusters = 1

model = KMeans(n_clusters=k_clusters, init='k-means++', n_init=10, max_iter=1000, tol=0.001, random_state=0)
model.fit(X_train_vc)

In [None]:
clusters = model.predict(X_train_vc)

In [None]:
df["ClusterName"] = clusters
# Show the first rows together with the cluster id predicted for each description.
df.head(20)

In [None]:
# NOTE(review): `X` and `y_kmeans` are not defined anywhere in the visible notebook, so this
# cell presumably raises NameError on a fresh kernel — confirm, then define them or drop the cell.
plt.scatter(X[y_kmeans==0, 0], X[y_kmeans==0, 1], s=100, c='red', label ='Cluster 1')

In [None]:
from sklearn.decomposition import PCA

# Project the TF-IDF matrix (densified with toarray()) onto two principal
# components so that the documents can be drawn in 2-D.
sklearn_pca = PCA(n_components = 2)
Y_sklearn = sklearn_pca.fit_transform(X_train_vc.toarray())

# Re-cluster in the projected space. random_state is fixed, as in the other
# KMeans calls of this notebook, so colours and centres are reproducible.
kmeans = KMeans(n_clusters=k_clusters, max_iter=600, algorithm = 'lloyd', random_state=0)
fitted = kmeans.fit(Y_sklearn)
prediction = kmeans.predict(Y_sklearn)

# Documents coloured by predicted cluster.
plt.figure(figsize=(12, 6))
plt.scatter(Y_sklearn[:, 0], Y_sklearn[:, 1], c=prediction, s=40, cmap='viridis', linewidths=5)

# Cluster centres drawn on top; the trailing ';' hides the repr of the artist.
centers = fitted.cluster_centers_
plt.scatter(centers[:, 0], centers[:, 1],c='black', s=200, alpha=0.6);

In [None]:
import numpy as np
import seaborn as sns

In [None]:
def get_top_features_cluster(tf_idf_array, prediction, n_feats, feature_names=None):
    """
    Rank, for every cluster, the features with the highest mean score.

    tf_idf_array: 2-D numpy array, one row per document and one column per
        feature (the notebook passes ``X_train_vc.toarray()``)
    prediction: 1-D array-like holding the cluster label of each row
    n_feats: number of top features to keep per cluster
    feature_names: optional sequence mapping column index -> feature name;
        when omitted, the vocabulary of the notebook-level ``vectorizer`` is used
    return: list of DataFrames with columns 'features' and 'score', one per
        distinct label, in the sorted order produced by ``np.unique``
    """
    # Looked up once: the vocabulary does not change between clusters.
    if feature_names is None:
        feature_names = vectorizer.get_feature_names_out()
    # A plain list compared with a scalar would not give an element-wise mask.
    prediction = np.asarray(prediction)

    dfs = []
    for label in np.unique(prediction):
        rows = np.where(prediction == label)  # row indices of this cluster
        x_means = np.mean(tf_idf_array[rows], axis=0)  # mean score per feature
        top = np.argsort(x_means)[::-1][:n_feats]  # indices of the n_feats best
        best_features = [(feature_names[i], x_means[i]) for i in top]
        dfs.append(pd.DataFrame(best_features, columns=['features', 'score']))
    return dfs


def plotWords(dfs, n_feats):
    """
    Draw one horizontal bar chart per cluster showing its top-scoring features.

    dfs: list of DataFrames with 'features' and 'score' columns, one per cluster
    n_feats: maximum number of rows of each DataFrame to plot
    """
    for cluster_idx, cluster_df in enumerate(dfs):
        plt.figure(figsize=(10, 5))
        heading = "Most Common Words in Cluster {}".format(cluster_idx)
        plt.title(heading, fontsize=10, fontweight='bold')
        top_rows = cluster_df[:n_feats]
        sns.barplot(x='score', y='features', orient='h', data=top_rows)

In [None]:
dfs = get_top_features_cluster(X_train_vc.toarray(), prediction, 20)
plotWords(dfs, 20)

In [None]:
cleaned_data = ["the instrument has a scale, dial brass and it is wooden",
                "use ",
                " "]
cleaned_data = pd.DataFrame(cleaned_data, columns=["descriptions"])
cleaned_data = cleaned_data["descriptions"].apply(clean_text)
predicted = model.predict(vectorizer.transform(cleaned_data))
predicted

The problem with this approach is that it does not quantify how well the example sentence matches the cluster it is assigned to. This is probably related to the fact that it is treated as an unsupervised problem.
Idea to explore: use clustering in a supervised way, using the names of the collections as cluster names! Would that work? Maybe even better would be to use fasttext, which is designed for supervised learning.

# Experiment using fasttext

The idea is to use the fasttext classifier to classify all descriptions into labels. In this experiment we use the names of the datasets as label names. This is a supervised training algorithm. If this works well, we could then take new sentences from different datasets and measure their classification score for one of the labels. If the classification score is high enough, we could add the sentence to the training datasets. <br>
Reference https://towardsdatascience.com/fasttext-for-text-classification-a4b38cbff27c

First we have to organize the descriptions in the classes given by the datasetname

In [11]:
def read_single_collection_(listOfFileNames, field, zip_):
    """Collect the English values of one metadata field from a zipped collection.

    Parameters
    listOfFileNames: names of the archive members to read (e.g. the result of
        ``zip_.namelist()``)
    field: metadata field to extract, typically 'title' or 'description'
    zip_: open ZipFile object of the collection (e.g. opened from 3204.zip)

    Returns a list with the text of the first ``field`` element of each member,
    kept only when that element's ``xml:lang`` attribute equals 'en'.
    """
    descriptions = []
    for filename in listOfFileNames:
        # Directory entries carry no XML payload.
        if filename.endswith('/'):
            continue
        # Parse straight from the archive. The previous extract/open/remove
        # round trip was guarded by `os.path.isfile(...) != None`, which is
        # always true, and it deleted any same-named file already present in
        # the working directory; reading the member's bytes avoids both.
        soup = BeautifulSoup(zip_.read(filename), 'xml')
        t = soup.find(field)
        if t is not None and t.get('xml:lang') == 'en':  # English language tag only
            descriptions.append(t.text)
    return descriptions

In [14]:
#list of datasets to be considered
list_datasets_analysis=['02301.zip','2058501.zip']

['02301.zip', '2058501.zip', '03706.zip', '07101.zip']

Now we read the files from the collections, and categorize them according to their collection name

In [None]:
# This loop creates one DataFrame per collection: the first column holds the
# value of the metadata field, the second column the fastText label derived
# from the collection (zip) name.
list_of_dfs = []
# Only the collections selected in list_datasets_analysis are read; looping
# over list_data_files would process every archive found in the dump.
for dataset in list_datasets_analysis:
    file_name = os.path.join(data_path, dataset)
    with ZipFile(file_name, 'r') as zipObject:
        listOfFileNames = zipObject.namelist()
        des = read_single_collection_(listOfFileNames, 'description', zipObject)
    # The archive is no longer needed once the descriptions are in memory.
    df = pd.DataFrame(des, columns=["descriptions"])
    df['label'] = f'__label__{dataset.split(".")[0]}'
    list_of_dfs.append(df)
list_of_dfs

In [17]:
# creating a single dataframe with all collections
df_fin=pd.concat(list_of_dfs, ignore_index=True, axis=0)
df_fin.head()

Unnamed: 0,descriptions,label
0,After a humanist education in fifteenth-centur...,__label__02301
1,Lavoisier began his legal studies at the insti...,__label__02301
2,French chemist and physicist. After attending ...,__label__02301
3,A pharmacist in Rouen and student of G.-F. Rou...,__label__02301
4,The last child of a family involved in pottery...,__label__02301


In [56]:
# Use gensim's simple_preprocess to tokenize each description, then re-join the
# tokens with single spaces so that the text is ready for fasttext.
df_fin.iloc[:, 0] = df_fin.iloc[:, 0].apply(lambda x: ' '.join(simple_preprocess(x)))

In [33]:
df_fin.head()

Unnamed: 0,descriptions,label
0,after humanist education in fifteenth century ...,__label__02301
1,lavoisier began his legal studies at the insti...,__label__02301
2,french chemist and physicist after attending a...,__label__02301
3,pharmacist in rouen and student of rouelle des...,__label__02301
4,the last child of family involved in pottery s...,__label__02301


Splitting the dataframe randomly into training and test datasets

In [34]:
train, test = train_test_split(df_fin, test_size=0.3, random_state=1)

In [35]:
# Save the train data to a txt file: one space-separated row per sample, without header or index.
# NOTE(review): with sep=' ' and pandas' default quoting, every multi-word description is
# presumably wrapped in double quotes, which would glue quote characters to the first and last
# token seen by fasttext — inspect train.txt to confirm; quoting=csv.QUOTE_NONE is the usual remedy.
train.to_csv('train.txt', index=False, sep=' ', header=False, escapechar=" ")

In [36]:
# Save the test data to a txt file: one space-separated row per sample, without header or index.
# NOTE(review): with sep=' ' and pandas' default quoting, every multi-word description is
# presumably wrapped in double quotes, which would glue quote characters to the first and last
# token seen by fasttext — inspect test.txt to confirm; quoting=csv.QUOTE_NONE is the usual remedy.
test.to_csv('test.txt', index=False, sep=' ', header=False, escapechar=" ")

In [161]:
#training the model
model = fasttext.train_supervised('train.txt', wordNgrams = 2, epoch=5,lr=0.8)

Read 0M words
Number of words:  7247
Number of labels: 2
Progress: 100.0% words/sec/thread:  363255 lr:  0.000000 avg.loss:  0.050277 ETA:   0h 0m 0s


In [162]:
test.iloc[2,:]

descriptions    dagger shaped compass for geometrical surveyin...
label                                              __label__02301
Name: 547, dtype: object

In [163]:
test.iloc[18,:]

descriptions    mechanical device for the transmission and reg...
label                                              __label__02301
Name: 177, dtype: object

In [164]:
model.predict(test.iloc[18,0])  

(('__label__02301',), array([0.99950182]))

In [165]:
model.test('test.txt')                      

(614, 0.990228013029316, 0.990228013029316)

Now I test the category prediction of a random sentence taken not from Europeana and use the trained model
to assign it a label

In [166]:
#text copied by the new yorker
model.predict('what do you think')

(('__label__2058501',), array([1.00000989]))

Here I know that the sentence does not belong to Europeana, but the model still assigns it to a Europeana collection with high confidence - about 86% - which probably means that at this stage we cannot use this system
to judge whether data is in domain

# Experiment: extract all german text

In [None]:
# NOTE(review): `cs` is neither imported nor defined in the visible notebook, so this cell
# presumably raises NameError — confirm which similarity helper was intended before running it.
result = cs.calculate(df.iloc[0], df.iloc[1])

In [None]:
listOfFileNames

In [None]:
# Extract into the current working directory every archive member that is not
# already there; members that already exist are reported by name instead.
with ZipFile(file_name, 'r') as zipObject:
    listOfFileNames = zipObject.namelist()
    for fileName in listOfFileNames:
        if not os.path.isfile(fileName):
            zipObject.extract(fileName)
            print('file extracted')
        else:
            print('file already exist')
            print(fileName)

In [None]:
def read_single_collection(listOfFileNames, field):
    """
    Collect the language tag of one metadata field from extracted XML files.

    listOfFileNames: iterable of paths of XML files that can be opened from the
        current working directory
    field: tag name to look up (e.g. 'description')
    return: list with the non-empty ``xml:lang`` attribute of the first
        ``field`` element of each file; files lacking the field or the
        attribute contribute nothing

    NOTE(review): an earlier cell of this notebook defines a function with the
    same name that returns the English texts instead; this one shadows it.
    """
    lang_tags = []
    for filename in listOfFileNames:
        # Read bytes so the parser takes the encoding from the XML declaration
        # instead of depending on the platform's default text encoding.
        with open(filename, 'rb') as f:
            soup = BeautifulSoup(f.read(), 'xml')
        t = soup.find(field)
        # find() returns None when the record has no such field; calling
        # .get() on it would raise AttributeError.
        if t is None:
            continue
        lang = t.get('xml:lang')
        if lang:
            lang_tags.append(lang)
    return lang_tags

In [None]:
# Clean-up pass: for every archive in the dump, delete from the current working
# directory any regular file whose name matches one of the archive's members.
list_data_files = os.listdir(data_path)  # list of zip files
for zip_file in list_data_files:
    file_name = os.path.join(data_path, zip_file)  # name of zip file
    print(file_name)
    with ZipFile(file_name, 'r') as zipObject:
        listOfFileNames = zipObject.namelist()
        for fileName in listOfFileNames:
            # Nothing to do for members that were never extracted.
            if not os.path.isfile(fileName):
                continue
            print('file exists')
            os.remove(fileName)
         
                #     extracted_file=zipObject.extract(fileName)
                #     print('file extracted')
                #     with open( extracted_file, 'r') as f:
                #         file = f.read() 
                #     soup = BeautifulSoup(file, 'xml')
                #     t=soup.find('description')
                #     print(t.get('xml:lang'))
                #     print('file extracted')

In [None]:
# NOTE(review): the print statement below is cut off mid-expression (the call is never closed),
# so this cell cannot be parsed as it stands — recover the intended code or remove the cell.
for values in soup.findAll("time"):
    print("{} : {}, {}°".format(values["from"], values.find("symbol")["name"], values.fin

In [None]:
file_name = os.path.join(data_path,  '2051943.zip')
file_name

In [None]:
with open('plink__f_1_100475.xml', 'r') as f:
    file = f.read() 
soup = BeautifulSoup(file, 'xml')
t=soup.find('temporal')
#t.get('xml:lang')
t

In [None]:
file_name = os.path.join(data_path,  '2051943.zip')
file_name
    

In [None]:
file_name = os.path.join(data_path,  list_data_files[2])

In [None]:
# Extract into the current working directory every archive member that is not
# already there. The loop is indented consistently inside the `with` block so
# that the archive is still open while its members are extracted.
with ZipFile(file_name, 'r') as zipObject:
    listOfFileNames = zipObject.namelist()
    for fileName in listOfFileNames:
        if os.path.isfile(fileName):
            print('file already exist')
        else:
            # ZipFile.extract returns the path of the file it created;
            # `data` ends up holding the path of the last extracted member.
            data = zipObject.extract(fileName)
            print('file extracted')

In [None]:
#alternative to find keyword in file?
# Template: `files` and `your_path` are placeholders that must be defined
# before this cell is run.
keyword = 'your_keyword'
for file in files:
    file_path = os.path.join(your_path, file)
    if os.path.isfile(file_path):
        # `with` closes the handle even if the processing below raises.
        with open(file_path, 'r') as f:
            for x in f:
                if keyword in x:
                    # do what you want
                    pass  # placeholder so that the block is syntactically valid