In [2]:
import spacy
import numpy as np
import itertools
import lemmatizer as lm
import datetime

In [2]:
# Execute the trainLexicon script inside this notebook's namespace (-i);
# the progress messages it prints are shown below the cell.
%run -i trainLexicon

Negative verbs are added.
Zero infinitive forms of verbs are added.
Consonant softening forms are added.
Dropping vowel forms are added.
Becoming close vowel forms are added.
Transformed lexicon is saved to revisedDict.pkl


In [3]:
# Load the spaCy pipeline named 'model-best' (resolved by spacy.load as a
# package name or a path relative to the working directory).
nlp_ner = spacy.load('model-best')



In [498]:
# Parse one sample request with the NER pipeline.
doc = nlp_ner("2 gün sonraya gidiş Aralık 12 dönüş kendim ve 3 çocuk Ankara")

# Highlight colour for each entity label rendered by displaCy.
colors = {
    "DURAK": "#F67DE3",
    "YOLCU": "#7DF6D9",
    "SAYI": "#a6e22d",
    "AY": "#FF5733",
    "GÜN": "#2D14FF",
}
options = dict(colors=colors)

# Show the recognised entities inline in the notebook.
spacy.displacy.render(doc, style="ent", options=options, jupyter=True)

In [499]:
class Format:
    """Convert parsed date information into "dd-mm-YYYY" strings.

    All relative computations are anchored on ``datetime.date.today()``.
    """

    # Output format shared by every method that returns date strings.
    DATE_FORMAT = "%d-%m-%Y"

    # Turkish month names in calendar order; position + 1 is the month number.
    MONTH_NAMES = ['Ocak', 'Şubat', 'Mart', 'Nisan', 'Mayıs', 'Haziran',
                   'Temmuz', 'Ağustos', 'Eylül', 'Ekim', 'Kasım', 'Aralık']

    # Day offsets for relative-date words whose meaning is unambiguous.
    # Any other word falls back to DEFAULT_DELAY_DAYS (the previous behaviour
    # for every word).
    DELAY_DAY_OFFSETS = {
        'bugün': 0, 'bugüne': 0,
        'yarın': 1, 'yarına': 1, 'yarın için': 1,
        'haftaya': 7, 'haftaya bugün': 7,
        'haftaya yarın': 8,
    }
    DEFAULT_DELAY_DAYS = 7

    def format_date_string(self, date_list):
        """Return each date/datetime in ``date_list`` as a "dd-mm-YYYY" string."""
        return [str(date.strftime(self.DATE_FORMAT)) for date in date_list]

    def format_weekday(self, weekday_index):
        """Return the next occurrence of each weekday index (Monday == 0).

        A weekday equal to today's weekday resolves to today itself.
        """
        today = datetime.date.today()
        dates = [today + datetime.timedelta((index - today.weekday()) % 7)
                 for index in weekday_index]
        return self.format_date_string(dates)

    def format_delay(self, delay):
        """Translate relative-date words into formatted dates.

        Words listed in ``DELAY_DAY_OFFSETS`` use their own offset (for
        example 'yarın' is tomorrow); every other entry keeps the one-week
        default. One date is produced per entry of ``delay``.
        """
        today = datetime.date.today()
        dates = []
        for word in delay:
            days = self.DEFAULT_DELAY_DAYS
            if isinstance(word, str):
                days = self.DELAY_DAY_OFFSETS.get(word, self.DEFAULT_DELAY_DAYS)
            dates.append(today + datetime.timedelta(days=days))
        return self.format_date_string(dates)

    def format_sdelay(self, sdelay):
        """Translate ``(count, unit_text)`` pairs into formatted dates.

        A unit containing 'hafta' adds ``count`` weeks and a unit containing
        'gün' adds ``count`` days. Units containing neither produce no date.
        """
        today = datetime.date.today()
        dates = []
        # Pick weeks or days depending on the word found in the delay text.
        for sd in sdelay:
            if 'hafta' in sd[1]:
                dates.append(today + datetime.timedelta(weeks=int(sd[0])))
            if 'gün' in sd[1]:
                dates.append(today + datetime.timedelta(days=int(sd[0])))

        return self.format_date_string(dates)

    def format_datetime(self, date):
        """Build a ``datetime`` in the current year from ``(day, month_name)``.

        Raises ``ValueError`` when the month name is unknown or the day does
        not exist in that month.
        """
        current_year = datetime.date.today().year
        month_number = self.MONTH_NAMES.index(str(date[1])) + 1
        return datetime.datetime(current_year, month_number, int(date[0]))

    @staticmethod
    def _same_day_next_year(moment):
        """Return ``moment`` moved to the same calendar day one year later."""
        try:
            return moment.replace(year=moment.year + 1)
        except ValueError:
            # 29 February has no counterpart in the following year.
            return moment.replace(year=moment.year + 1, day=28)

    def format_dates(self, dates):
        """Format ``(day, month_name)`` pairs as "dd-mm-YYYY" strings.

        When exactly two dates are given and the second falls before the
        first, the second is treated as belonging to the next year. The
        rollover keeps the calendar day; adding a fixed 365 days would land
        one day early whenever a 29 February lies in between.
        """
        formatted_dates = [self.format_datetime(date) for date in dates]

        if len(formatted_dates) == 2 and formatted_dates[0] > formatted_dates[1]:
            formatted_dates[1] = self._same_day_next_year(formatted_dates[1])

        return self.format_date_string(formatted_dates)

In [500]:
class Process(Format):
    """Turn extracted entities into the fields of ``self.response_content``.

    Most methods read and write ``self.response_content``. This class does not
    create that dict itself, so it must exist (with at least a
    'DepartureDate' entry before ``process_delays`` runs).
    """

    # Placeholder written by process_dates when no usable date was found.
    DEFAULT_DEPARTURE = 'Yarın'

    def __init__(self):
        # Passenger tag -> words that count towards that tag in the URL.
        self.passenger_vocab = {"ADT":["yolcu", "yetişkin", "kişi","kendime","kendim"],
                                "CHD":["çocuk","çocuğum"],
                                "PET":["hayvan"],
                                "YNG":["öğrenci", "genç"],
                                "TCH":["öğretmen", "öğretim görevlisi"],
                                "PRS":['basın', 'muhabir'],
                                "MLT":['asker'],
                                "STF":['personel', 'çalışan'],
                                "60Y":['60 yaş'],
                                "65+":['65 yaş']}
        # Station name -> station code used in the URL.
        self.station_vocab = {"Söğütlüçeşme":"SGTC",
                              "Ankara":"ANKR",
                              "İstanbul":"ISTN",
                              "Eskişehir":"ESKR",
                              "Polatlı":"PLTL"}

    @staticmethod
    def _take_number_at(numbers, positions):
        """Remove and return the first ``(text, index)`` whose index is in ``positions``.

        Returns ``None`` when no number sits at any of the positions. The
        loop returns right after removing, so the list is never iterated
        after it has been modified.
        """
        for number in numbers:
            if number[1] in positions:
                numbers.remove(number)
                return number
        return None

    @staticmethod
    def _parse_date(text):
        """Parse a "dd-mm-YYYY" string so dates can be compared chronologically."""
        return datetime.datetime.strptime(text, "%d-%m-%Y")

    def process_date_indexes(self, months, numbers):
        """Pair each month with the number right before or right after it.

        ``months`` and ``numbers`` hold ``(text, entity_index)`` tuples. A
        number that is used is removed from ``numbers`` (in place). Returns
        the formatted dates and the remaining numbers.
        """
        dates = []
        for month in months:
            number = self._take_number_at(numbers, (month[1] - 1, month[1] + 1))
            if number is not None:
                dates.append((number[0], month[0]))

        dates = self.format_dates(dates)
        return dates, numbers

    def process_passenger_indexes(self, passenger_types, numbers):
        """Attach a count to every passenger type.

        The count is the number directly preceding the passenger word; when
        there is none the passenger is counted once. Every passenger type
        yields exactly one ``(count, word)`` entry, also when ``numbers`` is
        empty or contains unrelated numbers.
        """
        passengers = []
        for passenger in passenger_types:
            number = self._take_number_at(numbers, (passenger[1] - 1,))
            count = number[0] if number is not None else 1
            passengers.append((count, passenger[0]))
        return passengers

    def process_sdelay_indexes(self, sdelays, numbers):
        """Pair each counted-delay phrase with the number directly before it.

        Returns the formatted dates and the numbers that were not consumed.
        """
        s_delays = []
        for delay in sdelays:
            number = self._take_number_at(numbers, (delay[1] - 1,))
            if number is not None:
                s_delays.append((number[0], delay[0]))

        s_delays = self.format_sdelay(s_delays)
        return s_delays, numbers

    def process_indexes(self,dates, passengers, sdelays, numbers):
        """Distribute the numbers over months, counted delays and passengers.

        The order matters: months claim their numbers first, then counted
        delays, and passengers receive whatever is left. Empty inputs are
        returned untouched.
        """
        if len(dates) != 0:
            dates, numbers = self.process_date_indexes(dates, numbers)
        if len(sdelays) != 0:
            sdelays, numbers = self.process_sdelay_indexes(sdelays, numbers)
        if len(passengers) != 0:
            passengers = self.process_passenger_indexes(passengers, numbers)

        return dates, passengers, sdelays

    def process_dates(self, dates, weekdays):
        """Fill DepartureDate/ReturnDate from explicit dates and weekdays.

        ``dates`` are already formatted strings; ``weekdays`` are weekday
        indexes. When nothing usable is present the departure is set to the
        ``DEFAULT_DEPARTURE`` placeholder.
        """
        if len(weekdays) != 0:
            weekdays = self.format_weekday(weekdays)

        if len(dates)==2:
            self.response_content['DepartureDate'] = dates[0]
            self.response_content['ReturnDate'] = dates[1]
        elif len(dates)==1 and len(weekdays)==0:
            self.response_content['DepartureDate'] = dates[0]
        elif len(weekdays)==2:
            self.response_content['DepartureDate'] = weekdays[0]
            self.response_content['ReturnDate'] = weekdays[1]
        elif len(weekdays)==1:
            self.response_content['DepartureDate'] = weekdays[0]
            if len(dates)==1:
                self.response_content['ReturnDate'] = dates[0]
        else:
            self.response_content['DepartureDate'] = self.DEFAULT_DEPARTURE

    def _shift_departure(self, new_departure):
        """Make ``new_departure`` the departure; the old departure becomes the return.

        The 'Yarın' placeholder is not a real date, so it is never moved into
        ReturnDate.
        """
        previous = self.response_content['DepartureDate']
        if previous != self.DEFAULT_DEPARTURE:
            self.response_content['ReturnDate'] = previous
        self.response_content['DepartureDate'] = new_departure

    def process_delays(self, delays, sdelays):
        """Apply relative dates on top of what ``process_dates`` stored.

        ``delays`` are raw delay words (formatted here); ``sdelays`` are
        already formatted strings. With one of each, the earlier date becomes
        the departure; the comparison is done on parsed dates because
        "dd-mm-YYYY" strings do not sort chronologically.
        """
        if len(delays) != 0:
            delays = self.format_delay(delays)

        if len(delays)==2:
            self.response_content['DepartureDate'] = delays[0]
            self.response_content['ReturnDate'] = delays[1]
        elif len(delays)==1:
            if len(sdelays)==1:
                if self._parse_date(delays[0]) < self._parse_date(sdelays[0]):
                    earlier, later = delays[0], sdelays[0]
                else:
                    earlier, later = sdelays[0], delays[0]
                self.response_content['DepartureDate'] = earlier
                self.response_content['ReturnDate'] = later
            else:
                self._shift_departure(delays[0])
        elif len(sdelays)==1:
            self._shift_departure(sdelays[0])

    def process_stations(self, stations):
        """Fill From/To from the recognised station names.

        With two stations the first is stored as 'To' and the second as
        'From'. With one station it is the destination and the origin falls
        back to a fixed default. Any other count leaves From/To untouched.
        """
        if len(stations) == 2:
            # NOTE(review): the first station is treated as the destination;
            # confirm this matches the expected word order of requests.
            To, From = stations
            self.response_content['From'] = From
            self.response_content['To'] = To

        if len(stations) == 1:

            # Assign default_location as the departure location that is retrieved from GPS
            From, To = 'Eskişehir', stations[0]

            self.response_content['From'] = From
            self.response_content['To'] = To

    def process_passengers(self, passengers):
        """Store the ``(count, word)`` passenger list in the response."""
        self.response_content['Passengers'] = passengers

    def process_self(self, selff):
        """Add the requester as one passenger when a self-reference was found."""
        if len(selff) != 0:
            self.response_content['Passengers'].append((1, selff[0]))

    def _count_passengers(self, passengers):
        """Sum passenger counts per tag of ``passenger_vocab``.

        Words that appear in no tag's word list are ignored.
        """
        counts = {}
        for passenger in passengers:
            for tag, words in self.passenger_vocab.items():
                if passenger[1] in words:
                    # Add to an existing tag or start a new one.
                    counts[tag] = counts.get(tag, 0) + int(passenger[0])
        return counts

    def process_url(self, url_dict):
        """Build the availability URL and store it under ``response_content['url']``.

        Empty values (for example a missing return date) are skipped.
        Parameters follow the key order of ``url_dict``. Station codes are
        resolved before the loop, so the return leg does not depend on 'From'
        and 'To' having been visited first; ``from1``/``to1`` are emitted only
        for the stations that are known.

        Raises ``KeyError`` when a station name is missing from
        ``station_vocab``.
        """
        from_name = url_dict.get('From', "")
        to_name = url_dict.get('To', "")
        from_tag = self.station_vocab[from_name] if len(from_name) != 0 else None
        to_tag = self.station_vocab[to_name] if len(to_name) != 0 else None

        params = []
        for key, value in url_dict.items():
            # Skip empty values (e.g. no return date was given).
            if len(value) == 0:
                continue
            if key == 'From':
                params.append("from0" + "=" + from_tag)
            elif key == 'To':
                params.append("to0" + "=" + to_tag)
            elif key == 'DepartureDate':
                params.append("date0" + "=" + value)
            elif key == 'ReturnDate':
                # The return leg runs in the opposite direction.
                if to_tag is not None:
                    params.append("from1" + "=" + to_tag)
                if from_tag is not None:
                    params.append("to1" + "=" + from_tag)
                params.append("date1" + "=" + value)
            elif key == 'Passengers':
                for tag, count in self._count_passengers(value).items():
                    params.append(tag + "=" + str(count))

        if params:
            link = "/availability?" + "&".join(params)
        else:
            # Without parameters the trailing '?' is dropped as well.
            link = "/availability"
        self.response_content['url'] = link

In [502]:
# NOTE: Extract is defined in a cell that appears further down in this notebook
# (it was executed earlier, as In [501]); on a fresh top-to-bottom run this line
# raises NameError unless that cell is run first.
ex = Extract()

In [503]:
# Run the extraction pipeline on the parsed sample; results land in ex.response_content.
ex.extract(doc)

In [504]:
# Display the extracted fields and the generated URL.
ex.response_content

{'From': 'Eskişehir',
 'To': 'Ankara',
 'DepartureDate': '30-11-2022',
 'ReturnDate': '12-12-2022',
 'Passengers': [('3', 'çocuk'), (1, 'kendim')],
 'url': '/availability?from0=ESKR&to0=ANKR&date0=30-11-2022&from1=ANKR&to1=ESKR&date1=12-12-2022&CHD=3&ADT=1'}

In [505]:
# Display the parsed sample request for comparison with the result above.
doc

2 gün sonraya gidiş Aralık 12 dönüş kendim ve 3 çocuk Ankara

In [501]:
class Extract(Process):
    """
    Extract information from the received request and return a url as a response
    """
    def __init__(self):

        super().__init__()
        # Accepted surface forms per entity label; an entity whose text is not
        # in the list for its label is discarded.
        self.station_list = ['Ankara', 'İstanbul', 'Eskişehir', 'İzmir', 'Kars', 'Konya', 'Malatya', 'Adana', 'Polatlı', 'Söğütlüçeşme', 'Eryaman', 'Bakırköy', 'Bozüyük', 'Sakarya']
        self.month_list = ['Ocak', 'Şubat', 'Mart', 'Nisan', 'Mayıs', 'Haziran', 'Temmuz', 'Ağustos', 'Eylül', 'Ekim', 'Kasım', 'Aralık']
        self.weekday_list = ['pazartesi', 'salı', 'çarşamba', 'perşembe', 'cuma', 'cumartesi', 'pazar']
        self.selff_list = ['ben', 'bana', 'benim adıma', 'kendim', 'kendime']
        self.s_delay_list = ['güne', 'gün sonraya', 'gün sonrasına', 'gün ilerisine', 'ay', 'hafta', 'ayın', 'haftasına']
        self.gender_list = ['erkek', 'kadın', 'kız', 'hanımefendi', 'beyefendi']
        self.delay_list = ['haftaya', 'günübirlik', 'haftaya yarın', 'ertesi', 'ertesi gün', 'ertesi güne', 'yarın', 'yarına', 'yarın için', 'bugün', 'bugüne', 'haftaya bugün', 'sabaha', 'akşama', 'akşam için', 'sabah için', 'öğleye', 'öğlene', 'öğleden sonraya', 'öğleden sonrasına', 'hafta içi', 'hafta sonu']
        self.passenger_list = ['yolcu', 'engelli', 'arkadaş', 'askeri personel', 'sakat', 'gazi', 'kişilik', 'kişiye', 'şehit yakını', 'gazi yakını', 'hamile', 'çocuklu', 'arkadaşım', 'veli', 'yetişkin', 'çocuğum', 'yaşlı', 'kişi', 'çocuk', 'hayvan', 'öğrenci', 'genç', 'öğretmen', 'öğretim görevlisi', 'basın', 'muhabir', 'asker', 'personel', 'çalışan', '60 yaş', '65 yaş']
        # The numbers "1" .. "31" as strings.
        self.number_list = [str(number) for number in range(1, 32)]

        self.response_content = self._empty_response()

        self.entity_list = {'DURAK':self.station_list,
                            'AY':self.month_list,
                            'YOLCU':self.passenger_list,
                            'GÜN':self.weekday_list,
                            'SELF':self.selff_list,
                            'DELAY':self.delay_list,
                            'SDELAY':self.s_delay_list,
                            'CİNSİYET':self.gender_list,
                            'SAYI':self.number_list}

    @staticmethod
    def _empty_response():
        """Return a fresh response dict with every field blank."""
        return {'From': "",
                'To': "",
                'DepartureDate': "",
                'ReturnDate': "",
                'Passengers': [],
                'url': ""}

    def extract_entities(self, doc):
        """Group the entities of ``doc`` by label and resolve their numbers.

        ``doc.ents`` is walked in order; the position of an entity within
        ``doc.ents`` is kept for labels that are later paired with a
        neighbouring number. Entities with a label that is not in
        ``entity_list``, or whose text is not an accepted form for the label,
        are skipped.

        Returns ``[stations, dates, weekdays, delays, sdelays, passengers,
        selff, genders]``.
        """
        stations = []
        months = []
        passenger_types = []
        weekdays = []
        selff = []
        delays = []
        s_delays = []
        genders = []
        numbers = []

        # label -> (target list, whether the entity index is stored with the text)
        collectors = {'DURAK': (stations, False),
                      'AY': (months, True),
                      'YOLCU': (passenger_types, True),
                      'GÜN': (weekdays, False),
                      'SELF': (selff, False),
                      'DELAY': (delays, False),
                      'SDELAY': (s_delays, True),
                      'CİNSİYET': (genders, True),
                      'SAYI': (numbers, True)}

        for index, ent in enumerate(doc.ents):
            label = ent.label_
            ent_list = self.entity_list.get(label)
            if ent_list is None or label not in collectors:
                # A label this class has no vocabulary for: ignore the entity.
                continue

            text = str(ent)
            if label == 'DURAK' or label == 'AY' or label == 'GÜN':
                # Reduce inflected forms to the form listed in the vocabulary.
                text = str(lm.lemmatizeWord(text))
            if label == 'YOLCU':
                text = text.split(',')[0]

            if text not in ent_list:
                continue

            target, keep_index = collectors[label]
            target.append((text, index) if keep_index else text)

        dates, passengers, sdelays = self.process_indexes(months, passenger_types, s_delays, numbers)

        entities = [stations, dates, weekdays, delays, sdelays, passengers, selff, genders]
        return entities

    def extract_stations(self, entities):
        """Store From/To based on the station entities."""
        stations = entities[0]

        self.process_stations(stations)

    def extract_dates(self, entities):
        """Store departure/return dates from dates, weekdays and delays."""
        dates = entities[1]
        # Weekday names become their position in weekday_list (pazartesi == 0).
        weekdays = [self.weekday_list.index(weekday) for weekday in entities[2]]
        delays = entities[3]
        sdelays = entities[4]

        # Explicit dates first; relative delays are applied on top of them.
        self.process_dates(dates, weekdays)
        self.process_delays(delays, sdelays)

    def extract_passengers(self, entities):
        """Store the passenger list, adding the requester when mentioned.

        Gender entities (``entities[7]``) are collected but not used here.
        """
        passengers = entities[5]
        selff = entities[6]

        self.process_passengers(passengers)
        self.process_self(selff)

    def extract(self, doc):
        """Run the whole pipeline for ``doc`` and fill ``self.response_content``.

        The response is reset first so that fields from a previous request
        (for example an old return date) cannot leak into this one.
        """
        self.response_content = self._empty_response()

        entities = self.extract_entities(doc)
        self.extract_stations(entities)
        self.extract_dates(entities)
        self.extract_passengers(entities)
        self.process_url(self.response_content)