In [1]:
import pandas as pd
!pip install requests pdfplumber

import requests
import io
import pdfplumber
import difflib

Collecting pdfplumber
  Downloading pdfplumber-0.10.3-py3-none-any.whl (48 kB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m49.0/49.0 kB[0m [31m800.9 kB/s[0m eta [36m0:00:00[0m
Collecting pdfminer.six==20221105 (from pdfplumber)
  Downloading pdfminer.six-20221105-py3-none-any.whl (5.6 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m5.6/5.6 MB[0m [31m37.8 MB/s[0m eta [36m0:00:00[0m
Collecting pypdfium2>=4.18.0 (from pdfplumber)
  Downloading pypdfium2-4.25.0-py3-none-manylinux_2_17_x86_64.whl (3.0 MB)
[2K     [90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━[0m [32m3.0/3.0 MB[0m [31m39.8 MB/s[0m eta [36m0:00:00[0m
Installing collected packages: pypdfium2, pdfminer.six, pdfplumber
Successfully installed pdfminer.six-20221105 pdfplumber-0.10.3 pypdfium2-4.25.0


In [2]:
def find_first_french_word(df):
    """Locate the row where the first column jumps from a 'v' word back to an 'a' word.

    The first column of ``df`` is scanned top to bottom. The function returns
    the index label of the first row whose word starts with ``'a'`` while the
    word in the previous row starts with ``'v'``.
    NOTE(review): presumably this marks the point where an alphabetically
    sorted word list starts over (the French section, going by the function
    name) -- confirm against the source document.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame whose first column (by position) holds strings.

    Returns
    -------
    The index label of the matching row, or ``None`` when the frame is empty
    or no such transition exists.
    """
    if df.empty:
        return None

    # Seed value: it does not start with 'v', so the first row can never match.
    previous_word = 'a'
    # Select the column by position; plain `row[0]` on a label-indexed Series
    # relies on a positional fallback that pandas has deprecated.
    for index, word in df.iloc[:, 0].items():
        # startswith() is safe on empty strings, unlike word[0].
        if word.startswith('a') and previous_word.startswith('v'):
            return index
        previous_word = word
    return None



In [3]:
pdf_url = 'https://docs.steinhardt.nyu.edu/pdfs/metrocenter/xr1/glossaries/ELA/GlossaryCognatesFrenchUpdated5-5-2014.pdf'
# A timeout keeps the cell from hanging indefinitely if the server stalls.
response = requests.get(pdf_url, timeout=60)
response.raise_for_status()

cognates = []

with io.BytesIO(response.content) as open_pdf_file:
    with pdfplumber.open(open_pdf_file) as pdf:
        for page in pdf.pages:
            # Treat a page without extractable text as empty rather than
            # failing on a None result.
            text = page.extract_text() or ''
            for line in text.split('\n'):
                parts = list(filter(None, line.split(' ')))
                # Keep only lines made of exactly four space-separated tokens;
                # the 2nd and 4th tokens form the stored pair.
                if len(parts) == 4:
                    cognates.append((parts[1], parts[3]))

cognates = pd.DataFrame(cognates, columns = ['1', '2'])
# Keep pairs whose two words share the same first letter (case-insensitive).
same_initial = cognates['1'].str[0].str.lower() == cognates['2'].str[0].str.lower()
cognates = cognates[same_initial]
first_french = find_first_french_word(cognates)
if first_french is None:
    raise ValueError('Could not locate the v -> a transition in the extracted cognate table')
# NOTE(review): first_french is an index *label* of the filtered frame, while
# the slice below is positional (as the original `[first_french-14:]` was).
# The 14-row back-off looks hand-tuned for this particular PDF -- confirm
# before reusing with another document.
FRENCH_SECTION_BACKOFF = 14
french_cognates = cognates['1'].iloc[first_french - FRENCH_SECTION_BACKOFF:].tolist()


In [4]:
# Count exact matches between a row's tokens and the French cognate list.
import ast
from collections import Counter

def count_cognates(word_list, french_cognates):
    """Count exact (token, cognate) matches.

    Every token contributes once for each entry of ``french_cognates`` it is
    equal to, so duplicated cognates and repeated tokens are both counted
    multiple times.

    Parameters
    ----------
    word_list : str or iterable
        Either the string form of a Python list literal (parsed with
        ``ast.literal_eval``) or an already-parsed iterable of tokens.
        Tokens must be hashable.
    french_cognates : iterable
        Cognate words to match against; must be hashable.

    Returns
    -------
    int
        Total number of equal (token, cognate) pairs.
    """
    tokens = ast.literal_eval(word_list) if isinstance(word_list, str) else word_list
    # Pre-count the cognates once so each token is a single dictionary lookup
    # instead of a full scan of the cognate list.
    cognate_multiplicity = Counter(french_cognates)
    return sum(cognate_multiplicity[token] for token in tokens)

In [5]:
# Both splits are read from the same pinned commit of the project repository.
TEST_CSV_URL = 'https://github.com/eperroud/DataScienceProject/raw/08daf0738e55dfca4b0d7046e710c65085edd4e9/data/new_test.csv'
TRAINING_CSV_URL = 'https://github.com/eperroud/DataScienceProject/raw/08daf0738e55dfca4b0d7046e710c65085edd4e9/data/new_training.csv'

test = pd.read_csv(TEST_CSV_URL)
training = pd.read_csv(TRAINING_CSV_URL)

In [6]:
# Exact-match cognate count for every row of both splits.
# NOTE: later cells assign 'cognate_count' again using find_cognates, which
# replaces the values computed here.
training['cognate_count'] = training['tokens'].apply(count_cognates, args=(french_cognates,))
test['cognate_count'] = test['tokens'].apply(count_cognates, args=(french_cognates,))

In [7]:
def find_cognates(word_list, french_cognates, similarity_threshold=0.90):
    """Count (cognate, token) pairs whose difflib similarity exceeds a threshold.

    For every combination of a French cognate and a token, the
    ``difflib.SequenceMatcher`` ratio is computed with the cognate as the
    first sequence and the token as the second. The pair is counted when the
    ratio is strictly greater than ``similarity_threshold``.

    Parameters
    ----------
    word_list : str or iterable
        Either the string form of a Python list literal (parsed with
        ``ast.literal_eval``) or an already-parsed iterable of tokens.
    french_cognates : iterable of str
        Cognate words to compare against.
    similarity_threshold : float, default 0.90
        Exclusive lower bound on the similarity ratio.

    Returns
    -------
    int
        Number of pairs above the threshold.
    """
    tokens = ast.literal_eval(word_list) if isinstance(word_list, str) else word_list
    # Materialise once: the cognates are re-scanned for every token below.
    french_words = list(french_cognates)

    # SequenceMatcher caches its analysis of the second sequence, so one
    # matcher is reused with the token fixed as seq2 and only seq1 varied.
    # The (cognate, token) argument order matches the original call.
    matcher = difflib.SequenceMatcher()
    match_count = 0
    for token in tokens:
        matcher.set_seq2(token)
        for french_word in french_words:
            matcher.set_seq1(french_word)
            # real_quick_ratio() and quick_ratio() are cheap upper bounds on
            # ratio(); a pair that fails either cannot pass the full check.
            if (matcher.real_quick_ratio() > similarity_threshold
                    and matcher.quick_ratio() > similarity_threshold
                    and matcher.ratio() > similarity_threshold):
                match_count += 1
    return match_count


In [8]:
# Fuzzy-match cognate count for the training split; this assignment replaces
# any existing 'cognate_count' column.
training['cognate_count'] = training['tokens'].apply(find_cognates, args=(french_cognates,))


In [9]:
# Fuzzy-match cognate count for the test split; this assignment replaces any
# existing 'cognate_count' column.
test['cognate_count'] = test['tokens'].apply(find_cognates, args=(french_cognates,))


In [None]:
from google.colab import files

# Write both frames to CSV first (without the index column), then hand each
# file to the Colab download helper in the same order.
for frame, csv_name in ((training, 'final_training.csv'), (test, 'final_test.csv')):
    frame.to_csv(csv_name, index=False)

for csv_name in ('final_training.csv', 'final_test.csv'):
    files.download(csv_name)

<IPython.core.display.Javascript object>

<IPython.core.display.Javascript object>