# Dataset Preparation

This notebook presents initial data selection for complex sentences. 



In [1]:
import matplotlib.pyplot as plt
import pandas as pd
from tqdm.notebook import tqdm
import seaborn as sns
import numpy as np
import json

In [2]:
def dump_jsonl(data, output_path, append=False, progress=False):
    """
    Write list of objects to a JSON lines file.
    """
    mode = 'a+' if append else 'w'
    with open(output_path, mode, encoding='utf-8') as f:
        if progress:
            data = tqdm(data)
            
        for line in data:
            json_record = json.dumps(line, ensure_ascii=False)
            f.write(json_record + '\n')
    print('Wrote {} records to {}'.format(len(data), output_path))
    
def load_jsonl(input_path, progress=False) -> list:
    """
    Read list of objects from a JSON lines file.
    """
    data = []
    with open(input_path, 'r', encoding='utf-8') as f:
        if progress:
            f = tqdm(f)
            
        for line in f:
                data.append(json.loads(line.rstrip('\n|\r')))
            
    print('Loaded {} records from {}'.format(len(data), input_path))
    return data

## Loading Data 

There are two text input files.
* general texts for TF-IDF computation
    * We will use this to calculate rare words using tf-idf
    * In this file you will have each row as sentence/paragraph etc 
    * Need to very large min 200K sentences
    
* target source where complex sentences will be selected.
    * We will use this to select the Complex sentence based on the different conditions
    * In this file you will have each row as sentence with option to have categories (i.e. from where senetence is selected)
    * Need to very large min 50K sentences




In [3]:
data_dir = r'D:\rarewordshindi\Textchunks'
generic_file = '\chunk_d.csv'
generic = pd.read_csv(data_dir+generic_file)

In [4]:
import nltk
import pandas as pd
from nltk.corpus import indian
from nltk.tag import tnt

# Make sure to download the required NLTK data
nltk.download('indian')

# Sample data: replace this with reading your actual CSV file using pd.read_csv('yourfile.csv')


# Function to extract nouns from a Hindi text string
def extract_nouns(text):
    tokens = text.split()
    train_data = indian.tagged_sents('hindi.pos')
    tnt_pos_tagger = tnt.TnT()
    tnt_pos_tagger.train(train_data)
    tagged_words = tnt_pos_tagger.tag(tokens)
    nouns = [word for word, tag in tagged_words if tag.startswith('NN')]
    return nouns

# Applying the function to each row in the DataFrame's 'text' column
generic['nouns'] = generic['text'].apply(extract_nouns)

# Print the DataFrame with an additional column for nouns
print(generic)


[nltk_data] Downloading package indian to
[nltk_data]     C:\Users\DELL\AppData\Roaming\nltk_data...
[nltk_data]   Package indian is already up-to-date!


                                                   text  \
0        "आवेदन करने की आखिरी तारीख 31 जनवरी, 2020 है।"   
1     "इतनी दुआ कर दो हमारे लिए कि जितना प्यार दुनिय...   
2     "मोदी सरकार के पहले कार्यकाल में भी तीन तलाक क...   
3     "भाजपा के दिवंगत नेता प्रमोद महाजन की बेटी पून...   
4     "ऐसी स्थिति में एक न्यायपूर्ण सरकार सार्वजनिक ...   
...                                                 ...   
2029  "देश के सबसे पुराने राजनीतिक दल के अध्यक्ष से ...   
2030  "भारतीय हॉकी टीम के कप्तान मनप्रीत सिंह ने गोल...   
2031  "अगर ऐसा होता है तो दिशा इस शो में दो साल से अ...   
2032  "शिमला — कानपुर में खेली गई राष्ट्रीय स्कूल ता...   
2033                    "इसी दौरान पवन भी नहा  रहा था।"   

                                                  nouns  
0                                                    []  
1                                              [दुनिया]  
2                                          [सरकार, पास]  
3                                          [नेता, सचिव]  
4

In [58]:
generic.head()

Unnamed: 0,text,nouns,tokens
0,"""आवेदन करने की आखिरी तारीख 31 जनवरी, 2020 है।""",[],"[[""आवेदन, करने, की, आखिरी, तारीख, 31, जनवरी,, ..."
1,"""इतनी दुआ कर दो हमारे लिए कि जितना प्यार दुनिय...",['दुनिया'],"[[""इतनी, दुआ, कर, दो, हमारे, लिए, कि, जितना, प..."
2,"""मोदी सरकार के पहले कार्यकाल में भी तीन तलाक क...","['सरकार', 'पास']","[[""मोदी, सरकार, के, पहले, कार्यकाल, में, भी, त..."
3,"""भाजपा के दिवंगत नेता प्रमोद महाजन की बेटी पून...","['नेता', 'सचिव']","[[""भाजपा, के, दिवंगत, नेता, प्रमोद, महाजन, की,..."
4,"""ऐसी स्थिति में एक न्यायपूर्ण सरकार सार्वजनिक ...","['स्थिति', 'सरकार', 'रूप']","[[""ऐसी, स्थिति, में, एक, न्यायपूर्ण, सरकार, सा..."


In [81]:
generic.loc[0,'text']

'"आवेदन करने की आखिरी तारीख 31 जनवरी, 2020 है।"'

In [25]:
! pip install indic-nlp-library




In [6]:

def split_sentence(text, lang="hi"):
    #If there is other way to split the sentence in your language please use option and change
    
    # Remove newline, tabs. 
    # You could add more things if needed
    text = text.replace("\n"," ")
    text = text.replace("\t", " ")
        
    #You need to include your language Senetence Splitter
    if lang == "hi": 
        text = text.lower()
        sents = text.split('|') # could also split by '?' etc 
    elif lang == 'th':
        from pythainlp.tokenize import sent_tokenize
        sents = sent_tokenize(text)
    return sents

def word_tokenize(text, lang="hi"):
    #If there is other way to tokenize the sentence in your language please use option and change
    #You need to include your language Word Tokenizer
    if lang == "hi": 
        words = text.split(' ') #Lematize
    elif lang == 'th':
        from pythainlp.tokenize import word_tokenize
        words = word_tokenize(text)
    return words


def tokenize_sentence(text, lang="hi"):
    sents = split_sentence(text, lang=lang)
    words = []
    for s in sents:
        w = word_tokenize(s, lang=lang)
        words.append(w)
    return words





In [7]:
def add_tokens(df):
    tokens = []
    for idx, row in tqdm(df.iterrows(), total=len(df)):
        words = tokenize_sentence(row["text"])
        tokens.append(words)
    df["tokens"] = tokens
    
    return df

In [8]:
generic = generic.astype(str)

In [9]:
generic = add_tokens(generic)

  0%|          | 0/2034 [00:00<?, ?it/s]

In [10]:
generic.head()

Unnamed: 0,text,nouns,tokens
0,"""आवेदन करने की आखिरी तारीख 31 जनवरी, 2020 है।""",[],"[[""आवेदन, करने, की, आखिरी, तारीख, 31, जनवरी,, ..."
1,"""इतनी दुआ कर दो हमारे लिए कि जितना प्यार दुनिय...",['दुनिया'],"[[""इतनी, दुआ, कर, दो, हमारे, लिए, कि, जितना, प..."
2,"""मोदी सरकार के पहले कार्यकाल में भी तीन तलाक क...","['सरकार', 'पास']","[[""मोदी, सरकार, के, पहले, कार्यकाल, में, भी, त..."
3,"""भाजपा के दिवंगत नेता प्रमोद महाजन की बेटी पून...","['नेता', 'सचिव']","[[""भाजपा, के, दिवंगत, नेता, प्रमोद, महाजन, की,..."
4,"""ऐसी स्थिति में एक न्यायपूर्ण सरकार सार्वजनिक ...","['स्थिति', 'सरकार', 'रूप']","[[""ऐसी, स्थिति, में, एक, न्यायपूर्ण, सरकार, सा..."


## Calculate TF-IDF

In [11]:
from collections import Counter

def get_tfidf(sents):
    tfcounter = Counter()
    dfcounter = Counter()
    N = 0
    for idx, row in tqdm(sents.iterrows(), total=len(sents)):
        for s in row["tokens"]:
            N += 1
            tfcounter.update(s)

            uniqueTokens = list(set(s))
            dfcounter.update(uniqueTokens)  

    return N, tfcounter, dfcounter

def save_tfidf(generic,tf_file_name,df_file_name):
    N, tfcounter, dfcounter = get_tfidf(generic)
    dump_jsonl([tfcounter], tf_file_name)
    dump_jsonl([dfcounter], df_file_name)
    
    return N


In [12]:
generic_file[:-4]

'\\chunk_d'

In [13]:
tf_file_name = data_dir + '/tfcounter'+generic_file[:-4]+'jsonl'
df_file_name = data_dir + '/dfcounter'+generic_file[:-4]+'jsonl'

N = save_tfidf(generic,tf_file_name,df_file_name)    

  0%|          | 0/2034 [00:00<?, ?it/s]

Wrote 1 records to D:\rarewordshindi\Textchunks/tfcounter\chunk_djsonl
Wrote 1 records to D:\rarewordshindi\Textchunks/dfcounter\chunk_djsonl


In [14]:
def load_counter(filename):
    counter = load_jsonl(filename)[0]
    counter = pd.DataFrame([{"term":k, "count":counter[k]} for k in counter])
    return counter

def get_tfidfcounter(tf_file_name,df_file_name):
    _tfcounter = load_counter(tf_file_name)
    _dfcounter = load_counter(df_file_name)
    _tfcounter.columns = ["term", "tf"]
    _dfcounter.columns = ["term", "df"]
    tfidfcounter = _tfcounter.merge(_dfcounter, on="term")
    return tfidfcounter

In [15]:
tfidfcounter = get_tfidfcounter(tf_file_name,df_file_name)

Loaded 1 records from D:\rarewordshindi\Textchunks/tfcounter\chunk_djsonl
Loaded 1 records from D:\rarewordshindi\Textchunks/dfcounter\chunk_djsonl


## Select rare words

Select only terms that 
* consider only terms that can be observed at least 5 times in the corpus
* consider only Thai terms
* consider only terms that longer than 5 charecters (remove noise from word tokenization as Thai also doesn't have clear word boundary)

In [18]:
def filter_based_on_length_occuurance(tfidfcounter, minTF=5, minWordLenth = 5):

    tfidfcounter = tfidfcounter[tfidfcounter["tf"] >= minTF]
    tfidfcounter = tfidfcounter[tfidfcounter["term"].apply(lambda x: len(x) > minWordLenth)]
    
    return tfidfcounter
#You need to include your language specif minTF and minimum word length
tfidfcounter = filter_based_on_length_occuurance(tfidfcounter, minTF=5,minWordLenth = 5)

I used logarithmically scaled formular. Please [see](https://jmotif.github.io/sax-vsm_site/morea/algorithm/TFIDF.html) for more detail.

In [19]:
# N = number of documents; in this case, it is number of sentences in the corpus
tfidfcounter["tfidf"] = np.log(1+tfidfcounter["tf"]) * np.log(N/tfidfcounter["df"])

Define a threshold for the rare words and get the set of rare words in rare_word

In [20]:
tfidf = tfidfcounter.sort_values(by="tfidf", ascending=False).reset_index()
print(tfidf["tfidf"].describe())
# ax = tfidf["tfidf"].plot(title="TF-IDF", xlabel="word index")

count    402.000000
mean      12.359367
std        1.326304
min       10.802951
25%       11.202771
50%       12.158329
75%       13.256673
max       17.259000
Name: tfidf, dtype: float64


In [57]:
import pandas as pd

# Assuming tfidfcounter is a DataFrame that contains the TF-IDF scores and terms
# You might have something like this, for example:
# tfidfcounter = pd.DataFrame({'term': ['विद्यार्थी', 'student', 'खेल', 'game'], 'tfidf': [0.1, 0.2, 0.3, 0.4]})

# Use 25% Quantile as threshold for rare words
quantile = 0.45 # You could change this
threshold = tfidfcounter["tfidf"].quantile(quantile)
selected_terms = tfidfcounter[tfidfcounter["tfidf"] < threshold]

# Filter out English terms using a regular expression
selected_terms = selected_terms[~selected_terms['term'].str.contains(r'[a-zA-Z]', regex=True)]

# Print the filtered selected terms that are only in Hindi
print(selected_terms)

             term  tf  df      tfidf
35       कार्यकाल   5   5  10.802951
45        हालांकि   6   6  11.377582
86        प्रबंधन   6   6  11.377582
93        चेयरमैन   5   4  11.202771
158      ऐतिहासिक   5   5  10.802951
...           ...  ..  ..        ...
8759       शर्मा,   5   5  10.802951
8956    कार्यकारी   5   4  11.202771
9420   महाराष्ट्र   6   5  11.732363
9849      नेतृत्व   5   5  10.802951
10063      चुनावी   5   4  11.202771

[163 rows x 4 columns]


In [49]:
selected_terms

Unnamed: 0,term,tf,df,tfidf
35,कार्यकाल,5,5,10.802951
158,ऐतिहासिक,5,5,10.802951
253,रैंकिंग,5,5,10.802951
681,पहुंचने,5,5,10.802951
724,समाधान,5,5,10.802951
...,...,...,...,...
7410,क्षेत्रीय,5,5,10.802951
7596,बेहतरीन,5,5,10.802951
8166,सीनियर,5,5,10.802951
8759,"शर्मा,",5,5,10.802951


In [47]:
def extract_and_remove_nouns(text):
    tokens = text.split()
    train_data = indian.tagged_sents('hindi.pos')
    tnt_pos_tagger = tnt.TnT()
    tnt_pos_tagger.train(train_data)
    tagged_words = tnt_pos_tagger.tag(tokens)
    nouns = {word for word, tag in tagged_words if 'NN' in tag}
    # Removing nouns from the original text
    modified_text = ' '.join([word for word in tokens if word not in nouns])
    return modified_text

# Applying the function to each row in the DataFrame's 'term' column and updating it
df['term'] = df['term'].apply(extract_and_remove_nouns)

# Print the updated DataFrame
print(df)

           term
35     कार्यकाल
158    ऐतिहासिक
253     रैंकिंग
681            
724            
...         ...
7410  क्षेत्रीय
7596    बेहतरीन
8166     सीनियर
8759     शर्मा,
9849           

[73 rows x 1 columns]


In [48]:
df.to_csv('updated_terms.csv', index=False, encoding='utf-8')

## Pre-filtering candidate sentences

In [35]:
target_file = ''
target = pd.read_csv(data_dir+target_file)

In [36]:
target.head()

Unnamed: 0,text
0,"""ਖੇਤੀਬਾੜੀ ਮੰਤਰਾਲਾ ਰਾਜਸਥਾਨ, ਮੱਧ ਪ੍ਰਦੇਸ਼, ਪੰਜਾਬ,..."
1,"""ਅੱਤਵਾਦ ਨੂੰ ਕਿਸੇੇ ਧਰਮ ਨਾਲ ਨਹੀਂ ਜੋੜਿਆ ਜਾ ਸਕਦਾ, ..."
2,"""ਲਿਊ ਨੇ ਕਿਹਾ, ਇਸ ਸਭ ਤੋਂ ਚੀਨ - ਭਾਰਤ ਸੀਮਾ ਪਾਰ ਵਿ..."
3,"""ਉਸ ਮੁਤਾਬਿਕ ਅਗਲੀ ਕਾਰਵਾਈ ਕੀਤੀ ਜਾਵੇਗੀ।"""
4,"""ਕਈ ਸਾਲ ਬੀਤ ਗਏ ਅਤੇ ਸ਼ੇਲਾਹ ਵੱਡਾ ਹੋ ਗਿਆ ।"""


In [37]:
target = add_tokens(target)


100%|██████████| 1636898/1636898 [01:01<00:00, 26716.03it/s]


In [38]:
target.head()

Unnamed: 0,text,tokens
0,"""ਖੇਤੀਬਾੜੀ ਮੰਤਰਾਲਾ ਰਾਜਸਥਾਨ, ਮੱਧ ਪ੍ਰਦੇਸ਼, ਪੰਜਾਬ,...","[[""ਖੇਤੀਬਾੜੀ, ਮੰਤਰਾਲਾ, ਰਾਜਸਥਾਨ,, ਮੱਧ, ਪ੍ਰਦੇਸ਼,,..."
1,"""ਅੱਤਵਾਦ ਨੂੰ ਕਿਸੇੇ ਧਰਮ ਨਾਲ ਨਹੀਂ ਜੋੜਿਆ ਜਾ ਸਕਦਾ, ...","[[""ਅੱਤਵਾਦ, ਨੂੰ, ਕਿਸੇੇ, ਧਰਮ, ਨਾਲ, ਨਹੀਂ, ਜੋੜਿਆ, ..."
2,"""ਲਿਊ ਨੇ ਕਿਹਾ, ਇਸ ਸਭ ਤੋਂ ਚੀਨ - ਭਾਰਤ ਸੀਮਾ ਪਾਰ ਵਿ...","[[""ਲਿਊ, ਨੇ, ਕਿਹਾ,, ਇਸ, ਸਭ, ਤੋਂ, ਚੀਨ, -, ਭਾਰਤ, ..."
3,"""ਉਸ ਮੁਤਾਬਿਕ ਅਗਲੀ ਕਾਰਵਾਈ ਕੀਤੀ ਜਾਵੇਗੀ।""","[[""ਉਸ, ਮੁਤਾਬਿਕ, ਅਗਲੀ, ਕਾਰਵਾਈ, ਕੀਤੀ, ਜਾਵੇਗੀ।""]]"
4,"""ਕਈ ਸਾਲ ਬੀਤ ਗਏ ਅਤੇ ਸ਼ੇਲਾਹ ਵੱਡਾ ਹੋ ਗਿਆ ।""","[[""ਕਈ, ਸਾਲ, ਬੀਤ, ਗਏ, ਅਤੇ, ਸ਼ੇਲਾਹ, ਵੱਡਾ, ਹੋ, ਗਿ..."


Remove all sentence which doesn’t fall under cartain length

In [39]:
def filter_sent_by_length(sents, min_length = 8, max_length = 29):
    filtered_sents = []

    for idx, row in tqdm(sents.iterrows(), total=len(sents)):
        for s in row["tokens"]:
            if len(s) > max_length:
                continue

            if len(s) < min_length:
                continue

            filtered_sents.append({
                "sent": " ".join(s),
                "words": s,
                
                })

    filtered_sents = pd.DataFrame(filtered_sents)
    return filtered_sents

min_length = 8 #You could change this
max_length = 29 #You could change this

#You need to include your language specific minimum and maximum sentence length
selected_targets = filter_sent_by_length(target, min_length = 8, max_length = 29)

100%|██████████| 1636898/1636898 [00:55<00:00, 29641.27it/s]


Select `n` sentences which have rare words

In [40]:
from collections import defaultdict

def select_sents_by_rare_words(selected_terms, target_sents):

    candidateidx = set()
    keywords = defaultdict(list)

    for _, row in selected_terms.iterrows():
        try:
            docs = target_sents[target_sents["sent"].str.contains(row["term"], regex=False)]
            candidateidx.update(docs.index.to_list())
            for i in docs.index.to_list():
                keywords[i].append(row["term"])
        except Exception as e:
            print(row)
            print(e)

    candidateidx = list(candidateidx)
    rare_word_sents = target_sents.loc[candidateidx]
    print("Sents w/ rare words", len(rare_word_sents))

    for i in keywords:
        if i in rare_word_sents.index:
            rare_word_sents.loc[i, "keywords"] = ",".join(keywords[i])

    # Shuffle
    rare_word_sents.sample(frac=1)
    
    non_rare_word_sents = target_sents.loc[~target_sents.index.isin(candidateidx)]
    print("Sents w/o rare words", len(non_rare_word_sents))
    
    return rare_word_sents, non_rare_word_sents

rare_word_sents, non_rare_word_sents = select_sents_by_rare_words(selected_terms, selected_targets)

Sents w/ rare words 163833
Sents w/o rare words 916935


Resample sentences based on categories.



In [79]:
# #You need to change based on your categories name

# categories = [
#  'cat1',
#  'cat2'
# ]

In [41]:
def sample_by_text(sents, nsample = 100):
    # candidates = set()
    # for cat in categories:
    #     _sents = sents[sents["category"].str.contains(cat, regex=False)]
    #     if len(_sents)>0:
    #         _sents = _sents.sample(n=nsample)
    #         candidates.update(_sents.index.to_list())

    # sampled_sents = sents.loc[list(candidates)]
    sampled_sents = sents.sample(n=nsample)
    return sampled_sents

In [42]:
# nsample = number of sentences from each category
nsample = 4000 #You need to change this
sampled_rare_word_sents = sample_by_text(rare_word_sents,nsample)
sampled_rare_word_sents

Unnamed: 0,sent,words,keywords
961229,"""ਪ੍ਰਚਾਰਕ ਹੋਣ ਦੇ ਨਾਤੇ ਸਾਨੂੰ ਵੀ ਇੱਦਾਂ ਹੀ ਕਰਨ ਦੀ ...","[""ਪ੍ਰਚਾਰਕ, ਹੋਣ, ਦੇ, ਨਾਤੇ, ਸਾਨੂੰ, ਵੀ, ਇੱਦਾਂ, ਹੀ...","""ਪ੍ਰਚਾਰਕ"
45308,"""ਫੌਜ ਮੁਖੀ ਵੱਲੋਂ ਸ਼ੌਪੀਆਂ 'ਚ ਸ਼ਹੀਦ ਹੋਏ ਸੈਨਿਕਾਂ ਨੂੰ...","[""ਫੌਜ, ਮੁਖੀ, ਵੱਲੋਂ, ਸ਼ੌਪੀਆਂ, 'ਚ, ਸ਼ਹੀਦ, ਹੋਏ, ਸੈਨ...","ਸ਼ਰਧਾਂਜਲ,ਸ਼ਰਧਾਂਜ"
414396,"""ਪੁਲਿਸ ਦੇ ਦੁਆਰਾ ਇਨ੍ਹਾਂ ਦੋਸ਼ੀਆਂ ਨੂੰ ਗ੍ਰਿਫਤਾਰ ਕਰਕ...","[""ਪੁਲਿਸ, ਦੇ, ਦੁਆਰਾ, ਇਨ੍ਹਾਂ, ਦੋਸ਼ੀਆਂ, ਨੂੰ, ਗ੍ਰਿਫ...",ੱਖ-ਵੱਖ
169751,"""ਪੁਲਿਸ ਅਧਿਕਾਰੀ ਨੇ ਦੱਸਿਆ ਕਿ ਪੋਸਟ–ਮਾਰਟਮ ਤੋਂ ਬਾਅਦ...","[""ਪੁਲਿਸ, ਅਧਿਕਾਰੀ, ਨੇ, ਦੱਸਿਆ, ਕਿ, ਪੋਸਟ–ਮਾਰਟਮ, ਤ...","ਜਾਵੇਗੀ।"""
7474,"""ਉਨ੍ਹਾਂ ਨੇ ਈਟਾਨਗਰ ਦੇ ਆਈਜੀ ਪਾਰਕ ਤੋਂ ਕਈ ਹੋਰ ਵਿਕਾ...","[""ਉਨ੍ਹਾਂ, ਨੇ, ਈਟਾਨਗਰ, ਦੇ, ਆਈਜੀ, ਪਾਰਕ, ਤੋਂ, ਕਈ,...",ਪ੍ਰੋਜੈਕ
...,...,...,...
332183,"""ਸਕੂਲ ਦੇ ਚਾਰ ਹਾਉੂਸਾਂ ਦੇ ਵਿਦਿਆਰਥੀਆਂ ਨੇ ਇਸ 'ਚ ਵੱ...","[""ਸਕੂਲ, ਦੇ, ਚਾਰ, ਹਾਉੂਸਾਂ, ਦੇ, ਵਿਦਿਆਰਥੀਆਂ, ਨੇ, ...",ਹਾਉੂਸਾਂ
528619,"""ਅਪਰਾਧਿਕ ਮਾਣਹਾਨੀ ਮਾਮਲੇ 'ਚ ਦਿੱਲੀ ਹਾਈਕੋਰਟ ਵਲੋਂ ਕ...","[""ਅਪਰਾਧਿਕ, ਮਾਣਹਾਨੀ, ਮਾਮਲੇ, 'ਚ, ਦਿੱਲੀ, ਹਾਈਕੋਰਟ,...",ਕੇਜਰੀਵ
47941,"""’ ਉਨ੍ਹਾਂ ਕਿਹਾ ਕਿ ਸਰਕਾਰ ਨੂੰ ਕੁਝ ਨਹੀਂ ਸੁੱਝ ਰਿਹਾ।""","[""’, ਉਨ੍ਹਾਂ, ਕਿਹਾ, ਕਿ, ਸਰਕਾਰ, ਨੂੰ, ਕੁਝ, ਨਹੀਂ, ...","ਰਿਹਾ।"""
622324,"""ਸੜਕ ਹਾਦਸੇ 'ਚ ਏਅਰ ਫੋਰਸ ਦੇ ਮੁਲਾਜ਼ਮ ਦੀ ਮੌਤ""","[""ਸੜਕ, ਹਾਦਸੇ, 'ਚ, ਏਅਰ, ਫੋਰਸ, ਦੇ, ਮੁਲਾਜ਼ਮ, ਦੀ, ...",ਮੁਲਾਜ਼


In [43]:
rare_word_file = data_dir + '/rare_word_'+ target_file
sampled_rare_word_sents.to_csv(rare_word_file, index=False)

Select `n` sentences which **don't** have rare words

In [44]:
nsample = 4000 #You nee to change this
sampled_non_rare_word_sents = sample_by_text(non_rare_word_sents,nsample)
sampled_non_rare_word_sents

Unnamed: 0,sent,words
308348,"""ਇਸ ਸਬੰਧੀ ਸਾਰੀਆਂ ਤਿਆਰੀਆਂ ਮਕੁੰਮਲ ਕਰ ਲਈਆ ਗਈਆਂ ਹਨ।""","[""ਇਸ, ਸਬੰਧੀ, ਸਾਰੀਆਂ, ਤਿਆਰੀਆਂ, ਮਕੁੰਮਲ, ਕਰ, ਲਈਆ,..."
42297,"""ਇੱਦਾਂ ਕਰ ਕੇ ਅਸੀਂ ਵੀ ਪੌਲੁਸ ਵਾਂਗ ਦਿਖਾਉਂਦੇ ਹਾਂ ਕ...","[""ਇੱਦਾਂ, ਕਰ, ਕੇ, ਅਸੀਂ, ਵੀ, ਪੌਲੁਸ, ਵਾਂਗ, ਦਿਖਾਉਂ..."
407483,"""ਇਸ ਤੋਂ ਇਲਾਵਾ ਰਾਸ਼ਟਰਪਤੀ ਰਾਮਨਾਥ ਕੋਵਿੰਦ, ਕੇਂਦਰੀ ਗ...","[""ਇਸ, ਤੋਂ, ਇਲਾਵਾ, ਰਾਸ਼ਟਰਪਤੀ, ਰਾਮਨਾਥ, ਕੋਵਿੰਦ,, ਕ..."
244385,"""ਪੁਲਿਸ ਨੇ ਦੋਸ਼ੀਆਂ ਤੋਂ ਬਾਇਕ ਅਤੇ ਲੁੱਟ ਖੋਹ ਕੀਤਾ ਗਿ...","[""ਪੁਲਿਸ, ਨੇ, ਦੋਸ਼ੀਆਂ, ਤੋਂ, ਬਾਇਕ, ਅਤੇ, ਲੁੱਟ, ਖੋਹ..."
257666,"""ਉਸ ਦੀ ਤਸਵੀਰ ਸੀਸੀਟੀਵੀ ਕੈਮਰੇ `ਚ ਕੈਦ ਹੋ ਗਈ ਹੈ।""","[""ਉਸ, ਦੀ, ਤਸਵੀਰ, ਸੀਸੀਟੀਵੀ, ਕੈਮਰੇ, `ਚ, ਕੈਦ, ਹੋ,..."
...,...,...
419289,"""ਇਸ ਤੋਂ ਇਲਾਵਾ ਬੀਤੇ ਦਿਨ ਹੀ ਧਾਰਾ 144 ਵੀ ਲਾਗੂ ਹੈ।""","[""ਇਸ, ਤੋਂ, ਇਲਾਵਾ, ਬੀਤੇ, ਦਿਨ, ਹੀ, ਧਾਰਾ, 144, ਵੀ..."
224877,"""ਜੇ ਉਹ ਪੜ੍ਹ ਨਹੀਂ ਸਕਦੇ ਤਾਂ ਉਨ੍ਹਾਂ ਨੂੰ ਪੜ੍ਹਨਾ ਸਿ...","[""ਜੇ, ਉਹ, ਪੜ੍ਹ, ਨਹੀਂ, ਸਕਦੇ, ਤਾਂ, ਉਨ੍ਹਾਂ, ਨੂੰ, ..."
523024,"""ਦੱਖਣੀ ਏਸ਼ੀਆ ਦੀ ਪਹਿਲੀ ਭਾਰਤ-ਨੇਪਾਲ ਪੈਟਰੋਲੀਅਮ ਪਾਈਪ...","[""ਦੱਖਣੀ, ਏਸ਼ੀਆ, ਦੀ, ਪਹਿਲੀ, ਭਾਰਤ-ਨੇਪਾਲ, ਪੈਟਰੋਲੀਅ..."
250738,"""ਚੰਦਨ ਪਾਊਡਰ ਦੀ ਥੋੜ੍ਹੀ ਜਿਹੀ ਮਾਤਰਾ ਲੈ ਕੇ ਇਸ ਵਿੱਚ...","[""ਚੰਦਨ, ਪਾਊਡਰ, ਦੀ, ਥੋੜ੍ਹੀ, ਜਿਹੀ, ਮਾਤਰਾ, ਲੈ, ਕੇ..."


In [45]:
non_rare_word_file = data_dir + '/non_rare_word_'+ target_file
sampled_non_rare_word_sents.to_csv(non_rare_word_file, index=False)