In [2]:
import nltk
import pandas as pd
import os

The Italian corpus is from this source: https://www.corpusitaliano.it/en/contents/description.html

The corpus contains 220 million words drawn from online material.

They provide: 
1. the corpus
2. the annotated corpus
3. the occurrence counts for all the elements
4. the occurrence counts for the elements, excluding numbers, symbols, etc.



In [3]:
# Directory holding the downloaded corpus files. The original location remains the
# default; set the ITALIAN_CORPUS_DIR environment variable to run the notebook on
# another machine without editing this cell.
basedir = os.environ.get("ITALIAN_CORPUS_DIR", "/home/tlei/Desktop/Italian_Corpus")

In [8]:
# Load the lemma occurrence file and derive the frequency profile.
# NOTE(review): `header=1` together with explicit `names` should make pandas treat
# the rows up to and including row index 1 as non-data — confirm that the file
# really starts with two header/comment lines and no lemma is being dropped.
IT_lemma_freq = pd.read_csv(
    os.path.join(basedir, 'lemma-WITHOUTnumberssymbols-frequencies-paisa.txt'),
    sep=",",
    header=1,
    names=["lemma", "occurrence"],
)

# Grand total of all occurrence counts; used as the normalisation denominator.
total_lemma_counts = IT_lemma_freq['occurrence'].sum()

# Force every lemma to a plain string (later cells call len() on each lemma).
IT_lemma_freq["lemma"] = IT_lemma_freq["lemma"].astype(str)

# Normalised frequency: rounded to two decimals, then truncated to an integer.
# NOTE(review): 10e6 equals 1e7, so `freq` is scaled per ten million occurrences,
# not per million — confirm this is the scale the thresholds below are meant for.
IT_lemma_freq["freq"] = (
    IT_lemma_freq["occurrence"]
    .apply(lambda count: round(10e6 * count / total_lemma_counts, 2))
    .astype(int)
)



In [9]:
# Build the boolean masks that implement the selection criteria.

# Word length: lemmas of 4 to 7 characters, both bounds inclusive.
# NOTE(review): the previous comment listed lengths 3-7 while the test has always
# used a lower bound of 4 — confirm which lower bound is intended.
word_length = IT_lemma_freq['lemma'].str.len().between(4, 7)

# High-frequency band: `freq` strictly greater than 1000.
high_freq = IT_lemma_freq['freq'] > 1000

# Low-frequency band: `freq` strictly between 1 and 39.
low_freq = (IT_lemma_freq['freq'] > 1) & (IT_lemma_freq['freq'] < 39)


In [25]:
# Keep the rows that satisfy the length criterion and the respective frequency band.
IT_high_freq = IT_lemma_freq.loc[word_length & high_freq]
IT_low_freq = IT_lemma_freq.loc[word_length & low_freq]


In [None]:
# load the annotated corpus to get the linguistic info

In [12]:
# Some checks
# count how many words are in the corpus
# Define a function to count words
def count_words_in_file(file_path):
    """Count the whitespace-separated tokens in a UTF-8 text file.

    The file is consumed one line at a time, so memory use stays bounded by the
    longest line instead of holding the whole corpus (and a list of every token)
    in memory at once. Because a line break is itself whitespace, no token can
    span two lines, so the per-line totals add up to the same count as splitting
    the whole text in one go.

    Args:
        file_path: Path of the text file to read.

    Returns:
        int: Number of whitespace-delimited tokens (0 for an empty file).
    """
    with open(file_path, 'r', encoding='utf-8') as file:
        return sum(len(line.split()) for line in file)

# Path to the raw (un-annotated) corpus text file.
# NOTE(review): this is an absolute path on a different machine than `basedir`
# points to — adjust it before running.
file_path = '/Users/tiger/italian_stim/corpus/paisa.raw.utf8'

# Count the whitespace-separated tokens in the raw corpus and report the total.
word_count = count_words_in_file(file_path)
print(f'The file contains {word_count} words.')

# The count does not vary a lot (presumably compared with the documented corpus
# size — confirm), so we will use the provided occurrence counts.

The file contains 225292817 words.


In [26]:
def _with_freq_per_mil(selection, total_count):
    """Return `selection` with its raw `occurrence` column turned into `freq_per_mil`.

    The column keeps its position; each value becomes the occurrence count
    expressed per one million total occurrences, rounded to two decimals.

    Args:
        selection: DataFrame with an `occurrence` column of raw counts.
        total_count: Sum of all occurrence counts in the frequency list.

    Returns:
        A new DataFrame with `occurrence` replaced by `freq_per_mil`. If the
        frame has no `occurrence` column (e.g. the cell was already run), it is
        returned unchanged so the values are never rescaled twice.
    """
    if 'occurrence' not in selection.columns:
        return selection
    # The column is named 'occurrence'; renaming the misspelled 'occurance' was a
    # silent no-op and made the following lookup of 'freq_per_mil' raise KeyError.
    renamed = selection.rename(columns={'occurrence': 'freq_per_mil'})
    # 1e6 gives a true per-million rate (10e6 would be per ten million). Note that
    # the integer `freq` column computed when loading the list uses the 10e6 scale.
    return renamed.assign(
        freq_per_mil=renamed['freq_per_mil'].apply(
            lambda count: round(1e6 * count / total_count, 2)
        )
    )


IT_high_freq = _with_freq_per_mil(IT_high_freq, total_lemma_counts)
IT_low_freq = _with_freq_per_mil(IT_low_freq, total_lemma_counts)




In [1]:
# Make the Italian check list: collect the lexical information of each word so that we can select from it


def parse_paisa_corpus(file_path):
    """Parse a tab-separated, CoNLL-style annotated corpus file into a DataFrame.

    Lines that start with '#' and lines that are blank are ignored. Every other
    line is split on tab characters and kept only when it yields exactly ten
    fields; the last two (unused) fields are discarded. All values are kept as
    strings, exactly as they appear in the file.

    Args:
        file_path: Path of the UTF-8 encoded annotated corpus file.

    Returns:
        pandas.DataFrame with one row per retained line and the columns
        ID, FORM, LEMMA, CPOSTAG, POSTAG, FEATS, HEAD, DEPREL.
    """
    kept_columns = [
        'ID', 'FORM', 'LEMMA', 'CPOSTAG',
        'POSTAG', 'FEATS', 'HEAD', 'DEPREL',
    ]
    # Eight retained fields plus the two trailing unused ones.
    expected_field_count = len(kept_columns) + 2

    def token_rows(handle):
        # Yield the retained fields of every well-formed annotation line.
        for raw_line in handle:
            if raw_line.startswith('#') or not raw_line.strip():
                continue
            parts = raw_line.split('\t')
            if len(parts) != expected_field_count:
                continue
            yield parts[:len(kept_columns)]

    with open(file_path, 'r', encoding='utf-8') as corpus:
        rows = list(token_rows(corpus))

    return pd.DataFrame(rows, columns=kept_columns)

# Example usage: parse the annotated (CoNLL) corpus into a token-level DataFrame.
# NOTE(review): absolute path on the author's machine — adjust before running.
file_path =  '/home/tlei/Desktop/paisa.annotated.CoNLL.utf8'
df = parse_paisa_corpus(file_path)

# Display the DataFrame (pandas prints a truncated view of large frames)
print(df)

          ID       FORM      LEMMA CPOSTAG POSTAG                    FEATS  \
0          1         La         il       R     RD              num=s|gen=f   
1          2      Siria      Siria       S     SP                        _   
2          3          è     essere       V      V  num=s|per=3|mod=i|ten=p   
3          4         un         un       R     RI              num=s|gen=m   
4          5  obiettivo  obiettivo       S      S              num=s|gen=m   
...       ..        ...        ...     ...    ...                      ...   
22103828  11         il         il       R     RD              num=s|gen=m   
22103829  12          6          6       N      N                        _   
22103830  13     agosto     agosto       S      S              num=s|gen=m   
22103831  14       2010       2010       N      N                        _   
22103832  15          .          .       F     FS                        _   

         HEAD    DEPREL  
0           2       det  
1          