<a href="https://colab.research.google.com/github/detsutut/infoquality-dca/blob/master/info_quality.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [None]:
!pip install pyphen

In [None]:
import nltk
import string
import re
import pyphen
import os
import pandas as pd
# Sentence-tokenizer data needed by nltk.tokenize.sent_tokenize (used in
# split_sentences).
nltk.download('punkt')
# Recent NLTK releases read the tokenizer tables from 'punkt_tab' rather than
# the pickled 'punkt' package; fetch both so sent_tokenize works on either.
nltk.download('punkt_tab')
# Hyphenation dictionaries used to split words into syllable-like chunks.
dic_ita = pyphen.Pyphen(lang='it_IT')
# NOTE(review): 'en_EN' is not a usual locale code (en_US / en_GB are);
# confirm which English dictionary pyphen actually resolves it to.
dic_eng = pyphen.Pyphen(lang='en_EN')

In [None]:
def split_sentences(text: str):
    """Split *text* into a list of sentences.

    URLs are stripped first, then every line break is rewritten as ". "
    (presumably so that line ends count as sentence ends) before the text
    is handed to ``nltk.tokenize.sent_tokenize``.

    Args:
        text: Raw document text.

    Returns:
        The list of sentences returned by ``nltk.tokenize.sent_tokenize``.
    """
    prepared = remove_URL(text)
    # Blank-line gaps go first so a double newline yields a single ". ".
    for line_break in ("\n\n", "\n"):
        prepared = prepared.replace(line_break, ". ")
    return nltk.tokenize.sent_tokenize(prepared)

def remove_punct(text: str):
    """Replace every run of non-word characters in *text* with one space.

    A "word character" is whatever the regex class ``\\w`` matches (letters,
    digits and underscore); everything else, including whitespace and
    punctuation, is collapsed into a single space per run.

    Args:
        text: The string to clean.

    Returns:
        The cleaned string. Leading/trailing punctuation becomes a space too.
    """
    # Raw string: '\W' and '\s' are regex escapes, not Python string escapes,
    # and a non-raw literal triggers an invalid-escape warning.
    return re.sub(r'\W+\s*', ' ', text)

def remove_URL(sample):
    """Delete URL-like tokens from *sample*.

    Any substring that starts with "http" and continues with at least one
    non-whitespace character is removed, up to the next whitespace. The
    surrounding whitespace is left untouched.

    Args:
        sample: The string to process.

    Returns:
        *sample* with the matching substrings removed.
    """
    url_pattern = re.compile(r"http\S+")
    return url_pattern.sub("", sample)

def clean_text(text: str):
    """Normalise *text* into space-separated word tokens.

    Steps, in order: remove URLs, drop digits, turn newlines into spaces,
    replace runs of non-word characters with a space, and finally collapse
    runs of spaces (and any remaining 1-9 digits) into a single space.

    Args:
        text: Raw document text.

    Returns:
        The cleaned string, suitable for ``str.split()`` into words.
    """
    text = remove_URL(text)
    # str.replace() matches its argument literally, so the regex patterns
    # must go through re.sub() to actually strip digits.
    text = re.sub(r"\d", "", text)
    text = text.replace("\n", " ")
    text = remove_punct(text)
    text = re.sub(r"[1-9 ]{1,}", " ", text)
    return text


def split_syllab(word: str, dic: pyphen.Pyphen):
    """Break *word* into the chunks produced by hyphenating it with *dic*.

    Args:
        word: The word to split.
        dic: Hyphenation dictionary; its ``inserted`` method is called on
            *word* and the result is split on "-".

    Returns:
        A list of string chunks (a single-element list when no "-" is
        present in the hyphenated form).
    """
    hyphenated = dic.inserted(word)
    return hyphenated.split("-")

def get_long_words(text:str):
    """Return the whitespace-separated tokens of *text* with 7+ characters.

    Args:
        text: The string to scan; it is tokenised with ``str.split()``.

    Returns:
        A list of the tokens longer than six characters, in original order.
    """
    tokens = text.split()
    return [token for token in tokens if len(token) >= 7]

def get_RIX(text: str):
    """Compute the number of long words per sentence for *text*.

    Sentences come from ``split_sentences`` on the raw text; long words come
    from ``get_long_words`` applied to the ``clean_text`` version.

    Args:
        text: Raw document text.

    Returns:
        Long-word count divided by sentence count.

    Raises:
        ZeroDivisionError: If no sentence is found in *text*.
    """
    sentence_count = len(split_sentences(text))
    long_word_count = len(get_long_words(clean_text(text)))
    return long_word_count / sentence_count

def get_syllab_score(text: str, dic: pyphen.Pyphen):
    """Return the average number of hyphenation chunks per word in *text*.

    Args:
        text: Text whose whitespace-separated tokens are the words.
        dic: Hyphenation dictionary forwarded to ``split_syllab``.

    Returns:
        Total chunk count over all words divided by the number of words.

    Raises:
        ZeroDivisionError: If *text* contains no words.
    """
    tokens = text.split()
    total_syllables = sum(len(split_syllab(token, dic)) for token in tokens)
    return total_syllables / len(tokens)

def get_gunning_fog(text: str, dic: pyphen.Pyphen):
    """Compute 0.4 * (words/sentences + 100 * complex_words/words).

    Words are the tokens of ``clean_text(text)``; sentences come from
    ``split_sentences(text)``; a word is "complex" when ``split_syllab``
    yields three or more chunks for it.

    Args:
        text: Raw document text.
        dic: Hyphenation dictionary forwarded to ``split_syllab``.

    Returns:
        The score rounded to one decimal place.

    Raises:
        ZeroDivisionError: If *text* has no sentences or no words.
    """
    tokens = clean_text(text).split()
    sentence_count = len(split_sentences(text))
    complex_count = len(
        [token for token in tokens if len(split_syllab(token, dic)) >= 3]
    )
    word_count = len(tokens)
    score = 0.4*((word_count/sentence_count)+100*(complex_count/word_count))
    return round(score, 1)

def get_gulpease(text: str, dic: pyphen.Pyphen):
    """Compute the Gulpease readability index of *text*.

    The index is ``89 - LP/10 + 3*FR`` where LP is the number of letters per
    100 words and FR is the number of sentences per 100 words.

    Args:
        text: Raw document text.
        dic: Unused; kept so the signature matches the other index functions.

    Returns:
        The index rounded to one decimal place.

    Raises:
        ZeroDivisionError: If *text* contains no words.
    """
    words = clean_text(text).split()
    sentences = split_sentences(text)
    # Count only the characters inside words: taking the length of the whole
    # cleaned string would also count the separating spaces as letters.
    letters = sum(len(word) for word in words)
    LP = letters*100/len(words)
    FR = len(sentences)*100/len(words)
    return round(89-(LP/10)+(FR*3),1)

def get_flesch(text: str, dic: pyphen.Pyphen, type="ENG"):
    """Compute a Flesch-style readability index and its ingredients.

    Args:
        text: Raw document text.
        dic: Hyphenation dictionary forwarded to ``get_syllab_score``.
        type: Formula selector. "ITA72" and "ITA86" pick the two alternative
            coefficient sets; any other value uses the default formula.

    Returns:
        A dict with keys "words" (word count), "sentences" (sentence count),
        "W%" (words per sentence), "S%" (value of ``get_syllab_score``) and
        "index" (the selected formula, rounded to one decimal place).

    Raises:
        ZeroDivisionError: If *text* has no sentences or no words.
    """
    text_clean = clean_text(text)
    n_words = len(text_clean.split())
    n_sentences = len(split_sentences(text))
    W = n_words/n_sentences  # average number of words per sentence
    S = get_syllab_score(text_clean, dic)
    if type == "ITA86":
        index = 217-(1.3*W)-(61.8*S)
    elif type == "ITA72":
        index = 206-(0.65*W)-(100*S)
    else:
        index = 206.835-(84.6*S)-(1.015*W)
    return {
        "words": n_words,
        "sentences": n_sentences,
        "W%": W,
        "S%": S,
        "index": round(index, 1),
    }


In [None]:
from google.colab import drive
# Mount Google Drive at /content/gdrive; the corpus paths used in the cells
# below all live under /content/gdrive/MyDrive.
drive.mount('/content/gdrive')

Google



In [None]:
# For every sub-directory of the corpus, score each text file in its language
# folder and write one "<subdir>_index_<lan>.csv" next to the sub-directories.
path = "/content/gdrive/MyDrive/Your/Path/Here/Corpus/10Googles"
lan = "en" #ita en
dirs = [s.name for s in os.scandir(path) if s.is_dir()]
for dir_name in dirs:
    path_of_the_directory = f"{path}/{dir_name}/{lan}/"
    rows = []
    for filename in os.listdir(path_of_the_directory):
        f = os.path.join(path_of_the_directory, filename)
        if not os.path.isfile(f):
            continue
        # Context manager so the file is closed even if scoring raises.
        with open(f, 'r') as handle:
            text = handle.read()
        fv = get_flesch(text, dic_eng, type="ENG") #remember to change dic as well when changing language
        rows.append({'name': filename,
                     'words': fv['words'],
                     'sentences': fv['sentences'],
                     'words per sent': round(fv['W%'], 2),
                     'syllabs per word': round(fv['S%'], 2),
                     'RIX': round(get_RIX(text), 2),
                     'flesch': fv['index'],
                     # Computed with its own formula; previously this column
                     # just repeated the 'flesch' value.
                     'flesch_vacca_72': get_flesch(text, dic_ita, type="ITA72")['index'],
                     'flesch_vacca_86': get_flesch(text, dic_ita, type="ITA86")['index'],
                     })
    df = pd.DataFrame(rows)
    df.to_csv(path_or_buf=f"{path}/{dir_name}_index_{lan}.csv", sep=";")

Wiki

In [None]:
# Score every text file in the corpus language folder and write the results
# to "index_<lan>.csv" in the corpus root.
path = "/content/gdrive/MyDrive/Your/Path/Here/Corpus/Wikipedias"
lan = "en" #ita en
path_of_the_directory = f"{path}/{lan}/"
rows = []
for filename in os.listdir(path_of_the_directory):
    f = os.path.join(path_of_the_directory, filename)
    if not os.path.isfile(f):
        continue
    # Context manager so the file is closed even if scoring raises.
    with open(f, 'r') as handle:
        text = handle.read()
    fv = get_flesch(text, dic_eng, type="ENG") #remember to change dic as well when changing language
    rows.append({'name': filename,
                 'words': fv['words'],
                 'sentences': fv['sentences'],
                 'words per sent': round(fv['W%'], 2),
                 'syllabs per word': round(fv['S%'], 2),
                 'RIX': round(get_RIX(text), 2),
                 'flesch': fv['index'],
                 # Computed with its own formula; previously this column just
                 # repeated the 'flesch' value.
                 'flesch_vacca_72': get_flesch(text, dic_ita, type="ITA72")['index'],
                 'flesch_vacca_86': get_flesch(text, dic_ita, type="ITA86")['index'],
                 })
df = pd.DataFrame(rows)
df.to_csv(path_or_buf=f"{path}/index_{lan}.csv", sep=";")