## DSA/DMA PRICING 

#### This code generates a price for every article in the DSA (Digital Services Act) and DMA (Digital Markets Act) regulations

### Cleaning the Data

In [1]:
import re
import nltk
import json
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
import spacy
import string
from textstat.textstat import textstat

In [2]:
_SPACY_MODEL = None  # spaCy pipeline, loaded on first use and then reused


def _get_spacy_model():
    """Return the 'en_core_web_sm' pipeline, loading it only once."""
    global _SPACY_MODEL
    if _SPACY_MODEL is None:
        # spacy.load also accepts a path to an installed model directory
        _SPACY_MODEL = spacy.load('en_core_web_sm')
    return _SPACY_MODEL


def clean_text(data):
    """Normalise a passage of text and return its lemmas.

    The text is tokenised with NLTK, lower-cased, stripped of punctuation,
    filtered down to purely alphabetic tokens that are not English stop
    words, and finally lemmatised with spaCy.

    Args:
        data: A string of text.

    Returns:
        A list with one lemma string per token spaCy finds in the cleaned
        text.
    """
    table = str.maketrans('', '', string.punctuation)
    stop_words = set(stopwords.words('english'))

    words = []
    for token in word_tokenize(data):
        stripped = token.lower().translate(table)
        # isalpha() also drops tokens that became empty once punctuation
        # was removed, as well as numbers and mixed tokens.
        if stripped.isalpha() and stripped not in stop_words:
            words.append(stripped)

    # Loading the model is expensive, so it is shared across calls instead
    # of being reloaded for every article.
    doc = _get_spacy_model()(' '.join(words))
    return [token.lemma_ for token in doc]

### Splitting by Pattern

In [3]:
#Splitting by pattern using regex. In the original text every article starts with a '(', a number and a ')'. This function splits the text at these patterns.
def get_splits(txt):
    """Read a regulation text file and split it into numbered sections.

    Args:
        txt: Path of the text file to read.

    Returns:
        The list produced by splitting the file content on every marker of
        the form "(<number>)" followed by a newline (and optionally preceded
        by one). The first element is whatever precedes the first marker.

    Raises:
        OSError: If the file cannot be opened.
    """
    # Read the file the caller asked for instead of a hard-coded location.
    with open(txt, 'r', encoding='utf-8') as file:
        content = file.read()

    pattern = re.compile(r'\n?\([0-9]+\)\n')
    return re.split(pattern, content)

### Google Ads Prices

In [4]:
#Google Ads prices, also known as CPC (cost per click), extracted with an API and saved as a JSON file
def get_google_ads_prices(path, features):
    """Load keyword prices from a JSON file, keeping only wanted keywords.

    Args:
        path: Location of a JSON file holding a list of records, each with
            a 'name' and a 'CPC' entry.
        features: Container of keyword names to keep.

    Returns:
        A list of single-entry dicts ``{name: CPC}``, one per record whose
        name is in ``features``, in file order.
    """
    with open(path, 'r') as handle:
        records = json.load(handle)

    matched = []
    for record in records:
        keyword = record['name']
        if keyword in features:
            matched.append({keyword: record['CPC']})
    return matched

In [5]:
#Look up the Google Ads price for each word and use a default of 10 if the price doesn't exist.
#I am using only unigrams (single words) in the code. A single word sometimes doesn't mean anything to Google Ads.
#The problem was that Google Ads had trouble finding one-word keywords because they are not common. I will try implementing the code for n-grams in the future.
def parse_prices(GAprices, words, default=10):
    """Return one price per word, falling back to a default when unknown.

    Args:
        GAprices: List of dicts mapping a word to its price (the shape
            returned by get_google_ads_prices). If a word appears in
            several dicts, the last one wins.
        words: Iterable of words to price.
        default: Price used when a word has no usable price.

    Returns:
        A list of prices aligned with ``words``.
    """
    # Merge all dicts once so every entry is consulted, not just the last
    # dict in the list, and so an empty GAprices simply yields defaults.
    lookup = {}
    for entry in GAprices:
        lookup.update(entry)

    # A missing, None or zero price counts as "no price" and gets the default.
    return [lookup.get(w) or default for w in words]

### The Flesch–Reading-Ease

In [6]:
#The Flesch Reading Ease is a readability test designed to indicate how difficult a passage in English is to understand. https://en.wikipedia.org/wiki/Flesch%E2%80%93Kincaid_readability_tests#Flesch_reading_ease
#Higher scores indicate material that is easier to read; lower scores mark passages that are more difficult to read.
#The scores are generally between 0 and 100, but there is no limit on how low the score can be; a negative score is also valid. Since the sentences are quite long, my scores were quite low and sometimes even negative.

def get_readability_metrics(split):
    """Compute two textstat readability scores for a passage.

    Args:
        split: The text to score.

    Returns:
        A 2-tuple ``(flesch_reading_ease, mcalpine_eflaw)``.
    """
    return (
        textstat.flesch_reading_ease(split),
        textstat.mcalpine_eflaw(split),
    )

### TF/IDF
##### Calculate IDF by using every article as a document

In [7]:
import numpy as np
import pandas as pd
# from sklearn.preprocessing import normalize
from sklearn.feature_extraction.text import TfidfVectorizer
import matplotlib.pyplot as plt

In [8]:
def get_IDF(vectorizer:TfidfVectorizer):
    """Map every vocabulary term of a fitted vectorizer to a scaled IDF.

    Args:
        vectorizer: Object exposing ``vocabulary_`` (term -> column index)
            and ``idf_`` (IDF value per column index).

    Returns:
        A dict ``{term: 2 ** idf}``.
    """
    term_to_column = vectorizer.vocabulary_
    idf_by_column = vectorizer.idf_

    scaled = {}
    for term, column in term_to_column.items():
        # Exponentiating widens the gap between common and rare terms.
        scaled[term] = 2 ** (idf_by_column[column])
    return scaled

In [9]:
#Just a helper to get an array of IDF values for each of the considered words
def get_idf_embedding(article, idf):
    """Return the IDF weight of each space-separated word of an article.

    Args:
        article: Text whose words are separated by single spaces.
        idf: Dict mapping a word to its weight.

    Returns:
        A NumPy array with one weight per word; words missing from ``idf``
        get a weight of 1.
    """
    weights = []
    for token in article.split(" "):
        weights.append(idf.get(token, 1))
    return np.array(weights)

### Pricing Function

#### Price of article = sum(GoogleAdPrice(ngram) x tf/idf(unigram)) + TextReadability x constant



In [10]:
#For now the Google Ads price is looked up per unigram; the code will be extended to n-grams in the future
def get_price(cleaned_data, googleAdPrice, vectorizer: TfidfVectorizer, readability_weight=10):
    """Compute one price per article.

    For each article the price is
    ``sum(scaled_idf(word) * ad_price(word)) + readability_weight * flesch``,
    where the sum runs over the article's space-separated words.

    Args:
        cleaned_data: Iterable of article strings whose words are separated
            by single spaces.
        googleAdPrice: List of ``{word: price}`` dicts, passed on to
            parse_prices.
        vectorizer: Fitted TF-IDF vectorizer supplying the IDF values.
        readability_weight: Multiplier applied to the Flesch reading-ease
            score.

    Returns:
        A 1-D NumPy array with one scalar price per article.
    """
    idf = get_IDF(vectorizer)
    prices = []
    for c in cleaned_data:
        words = c.split(" ")
        emb = get_idf_embedding(c, idf)
        GAprices = np.array(parse_prices(googleAdPrice, words))
        # get_readability_metrics returns (reading_ease, mcalpine). Only the
        # Flesch score enters the price; multiplying the whole tuple would
        # repeat it and turn the price into an array instead of a number.
        reading_ease, _ = get_readability_metrics(c)
        prices.append((emb * GAprices).sum() + readability_weight * reading_ease)
    return np.array(prices)

### Making the Calculations

In [13]:
# Get the vectorizer.
# Change to ngram_range=(1, 3) to include bigrams and trigrams.
# ngram_range is documented as a tuple; recent scikit-learn releases validate
# the parameter type and reject a list.
tfidf_vectorizer = TfidfVectorizer(ngram_range=(1, 1), use_idf=True)

In [14]:
# Split the document into articles; the argument is the location of the DSA text file
splits = get_splits("/Users/fzayguler/dsa/NLPRegulationsFinal/data/dsa.txt")

In [15]:
# Run every split through clean_text; the result is one list of lemmas per split
cleaned_text = list(map(clean_text, splits))

In [16]:
# Join each list of lemmas back into one space-separated string,
# which is the input format the TF-IDF vectorizer expects
cleaned_data = [' '.join(map(str, lemmas)) for lemmas in cleaned_text]

In [17]:
# Just to show how the TF-IDF weights look as a document-term matrix.
tfidf_separate = tfidf_vectorizer.fit_transform(cleaned_data)

# get_feature_names() was deprecated in scikit-learn 1.0 and removed in 1.2;
# prefer get_feature_names_out() and fall back on older releases.
try:
    feature_names = tfidf_vectorizer.get_feature_names_out()
except AttributeError:
    feature_names = tfidf_vectorizer.get_feature_names()

df_tfidf2 = pd.DataFrame(tfidf_separate.toarray(), columns=feature_names)

print(df_tfidf2.head())

   ability  able  absence  abstain  abuse  abusive    access  accessibility  \
0      0.0   0.0      0.0      0.0    0.0      0.0  0.000000            0.0   
1      0.0   0.0      0.0      0.0    0.0      0.0  0.075114            0.0   
2      0.0   0.0      0.0      0.0    0.0      0.0  0.074636            0.0   
3      0.0   0.0      0.0      0.0    0.0      0.0  0.000000            0.0   
4      0.0   0.0      0.0      0.0    0.0      0.0  0.000000            0.0   

   accessible  accommodation  ...  withdraw  within   without  woman  work  \
0         0.0            0.0  ...       0.0     0.0  0.000000    0.0   0.0   
1         0.0            0.0  ...       0.0     0.0  0.000000    0.0   0.0   
2         0.0            0.0  ...       0.0     0.0  0.067689    0.0   0.0   
3         0.0            0.0  ...       0.0     0.0  0.000000    0.0   0.0   
4         0.0            0.0  ...       0.0     0.0  0.000000    0.0   0.0   

   working  would  xenophobic      year  yet  
0      0.



In [18]:
# Vocabulary terms, in column order of the TF-IDF matrix.
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back on older releases.
try:
    word_lst = tfidf_vectorizer.get_feature_names_out()
except AttributeError:
    word_lst = tfidf_vectorizer.get_feature_names()

# Total TF-IDF weight of each term, summed over all documents
count_lst = tfidf_separate.toarray().sum(axis=0)

vocab_df = pd.DataFrame((zip(word_lst, count_lst)),
                        columns=["vocab", "tfidf_value"])



In [19]:
# Show the table of vocabulary terms and their summed TF-IDF values
display(vocab_df)

Unnamed: 0,vocab,tfidf_value
0,ability,0.584717
1,able,1.853461
2,absence,0.393788
3,abstain,0.118115
4,abuse,0.243452
...,...,...
1341,working,0.089843
1342,would,0.183427
1343,xenophobic,0.081581
1344,year,0.514098


In [20]:
# Write the vocabulary table as CSV to a file named 'tfidf_mono' (no extension, relative path)
vocab_df.to_csv('tfidf_mono')

### Get the google ad prices and filter them with the features we've got

In [21]:
# Get the Google Ads prices and keep only the words present in the vocabulary.
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back on older releases.
try:
    feature_names = tfidf_vectorizer.get_feature_names_out()
except AttributeError:
    feature_names = tfidf_vectorizer.get_feature_names()

GAprices = get_google_ads_prices("/Users/fzayguler/dsa/NLPRegulations/data/dict_word_final.json", set(feature_names))



### Final Price Calculation


In [23]:
# Calculate the prices using the get_price function and save them as CSV
prices = get_price(cleaned_data, GAprices, tfidf_vectorizer)

# Dump the vocabulary as a list of {"name": word} records.
# get_feature_names() was removed in scikit-learn 1.2; prefer the replacement
# and fall back on older releases.
try:
    feature_names = tfidf_vectorizer.get_feature_names_out()
except AttributeError:
    feature_names = tfidf_vectorizer.get_feature_names()

with open("data/GAQuery.json", "w") as f:
    # str() keeps the entries JSON-serialisable whichever API supplied them
    data = [{"name": str(v)} for v in feature_names]
    json.dump(data, f)

# Label the prices "Article 0", "Article 1", ... in order
articles = [f"Article {i}" for i in range(len(prices))]

pricedf = pd.DataFrame(zip(articles, prices), columns=["Articles", "Prices"])

pricedf.to_csv("data/prices.csv")



In [24]:
# Line plot of the price of each article, saved to prices.jpg
_, ax = plt.subplots(nrows=1, ncols=1, figsize=(12, 8))
ax.plot(prices)
ax.set(xlabel="Article Number", ylabel="Price")
# ax.set_yscale("log")  # optional: switch the price axis to a log scale
plt.savefig("prices.jpg")
plt.close()