<a href="https://colab.research.google.com/github/gokhanturer/NER_Model_SparkNLP/blob/main/BC4CHEMD_TO_CONLL.ipynb" target="_parent"><img src="https://colab.research.google.com/assets/colab-badge.svg" alt="Open In Colab"/></a>

In [1]:
import csv

import numpy as np
import pandas as pd
# Use the fully qualified option key. The abbreviated "max_rows" pattern also
# matches "styler.render.max_rows" on pandas >= 1.4, which makes set_option
# raise OptionError ("Pattern matched multiple keys").
pd.set_option("display.max_rows", 100)

In [2]:
# Download the BC4CHEMD train and test splits (TSV files) from the
# cambridgeltl/MTL-Bioinformatics-2016 GitHub repository into the current
# working directory. `-q` suppresses wget's progress output.
!wget -q https://raw.githubusercontent.com/cambridgeltl/MTL-Bioinformatics-2016/master/data/BC4CHEMD/train.tsv

!wget -q https://raw.githubusercontent.com/cambridgeltl/MTL-Bioinformatics-2016/master/data/BC4CHEMD/test.tsv

In [6]:
# Load the whole training split into memory as one string.
# The encoding is pinned to UTF-8 so the text is decoded the same way on every
# platform instead of depending on the locale's default encoding.
with open("train.tsv", encoding="utf-8") as f:
  train_data = f.read()

In [7]:
# Load the whole test split into memory as one string.
# The encoding is pinned to UTF-8 so the text is decoded the same way on every
# platform instead of depending on the locale's default encoding.
with open("test.tsv", encoding="utf-8") as f:
  test_data = f.read()

In [3]:
def data_pre_process(data):
  """Split raw TSV text into lines and mark the sentence boundaries.

  The text is split on newline characters. Every empty line (the separator
  between two sentences) is replaced by the placeholder `None<TAB>None`;
  all other lines are kept unchanged.

  Args:
    data: Full content of the TSV file as a single string.

  Returns:
    A single-column DataFrame with one row per line of `data`.
  """
  placeholder = "None\tNone"  # stands in for the blank line between sentences

  lines = [line if line != "" else placeholder for line in data.split("\n")]

  return pd.DataFrame(lines)

In [4]:
def conll_generator(data, filename):
  """Convert raw two-column TSV text into a CoNLL-style DataFrame.

  Lines are parsed in memory and tokens are taken verbatim. The previous
  implementation re-read the intermediate CSV with a tab separator and
  recovered the token with `split(",")[1]`, which corrupted every token that
  CSV quoting touched (for example `1,2-diol` became `"1` and `"` became
  `\"\"\"`).

  Args:
    data: Full text of a TSV file with one `token<TAB>entity` pair per line
      and an empty line between sentences.
    filename: Path where the intermediate one-column CSV produced by
      `data_pre_process` is saved. The file is still written so existing
      callers keep getting it, but it is no longer read back.

  Returns:
    A DataFrame whose four columns are labelled `-DOCSTART-`, `-X-`, `-X-`
    and `O`, so that the header row doubles as the document marker when the
    frame is saved. The first row and every sentence boundary are all-NaN
    rows; every other row holds the token, two placeholder `NN` tags and the
    entity label.
  """
  separator = "None\tNone"  # placeholder data_pre_process inserts between sentences

  lines = data_pre_process(data)

  # Side effect kept for backward compatibility: save the pre-processed lines.
  lines.to_csv(filename)

  # The frame starts with an empty row so that a blank line follows the header.
  tokens = [np.nan]
  entities = [np.nan]

  for line in lines.iloc[:, 0]:
    # Only the exact placeholder is a sentence break, so a genuine token that
    # happens to be spelled "None" is kept as a token.
    if line == separator:
      tokens.append(np.nan)
      entities.append(np.nan)
      continue

    fields = line.split("\t")
    tokens.append(fields[0])
    # A line without a tab has no entity label.
    entities.append(fields[1] if len(fields) > 1 else np.nan)

  df = pd.DataFrame({"token": tokens, "entity": entities})

  # Two placeholder tag columns: "NN" for real tokens, NaN on boundary rows.
  df["pos1"] = df["token"].where(df["token"].isna(), "NN")
  df["pos2"] = df["token"].where(df["token"].isna(), "NN")

  df = df[["token", "pos1", "pos2", "entity"]]

  # Column names follow the CoNLL header convention.
  df.columns = ["-DOCSTART-", "-X-", "-X-", "O"]

  return df

In [8]:
# Convert the raw training text to a CoNLL-style frame. conll_generator also
# saves its intermediate CSV under `filename`.
# Note: `train_data` is rebound from the raw text to the resulting DataFrame.
data, filename = train_data, "train_data.csv"

train_data = conll_generator(data, filename)

In [9]:
# Preview the first 10 rows of the converted training frame.
train_data.head(10)

Unnamed: 0,-DOCSTART-,-X-,-X-.1,O
0,,,,
1,DPP6,NN,NN,O
2,as,NN,NN,O
3,a,NN,NN,O
4,candidate,NN,NN,O
5,gene,NN,NN,O
6,for,NN,NN,O
7,neuroleptic,NN,NN,O
8,-,NN,NN,O
9,induced,NN,NN,O


In [12]:
# Write the training frame as space-separated CoNLL text without the index.
# CoNLL has no quoting convention, so CSV quoting is disabled; with the default
# quoting a token such as `"` would be written as `""""`.
# With quoting off and no escapechar, a field containing a space raises
# csv.Error instead of silently producing a malformed line.
train_data.to_csv("/content/BC4CHEMD_train.conll", index = False, sep = " ", quoting = csv.QUOTE_NONE)

In [11]:
# Convert the raw test text to a CoNLL-style frame. conll_generator also
# saves its intermediate CSV under `filename`.
# Note: `test_data` is rebound from the raw text to the resulting DataFrame.
data, filename = test_data, "test_data.csv"

test_data = conll_generator(data, filename)

In [13]:
# Write the test frame as space-separated CoNLL text without the index.
# CoNLL has no quoting convention, so CSV quoting is disabled; with the default
# quoting a token such as `"` would be written as `""""`.
# With quoting off and no escapechar, a field containing a space raises
# csv.Error instead of silently producing a malformed line.
test_data.to_csv("/content/BC4CHEMD_test.conll", index = False, sep = " ", quoting = csv.QUOTE_NONE)