Итак, требования к итоговой работе по продвинутому Питону.

1. Классы, наследование, структура классов (3 балла)
2. @Красоты + красоты (использование особенностей языка программирования в разных видах) (2 балла)
3. Применение от трех библиотек, которые не ставятся вместе с Питоном (слишком простые могут пойти за 0,5, по номиналу - по баллу за библиотеку, но не больше 3 баллов) (3 балла)
4. Субъективное мнение преподавателя (2 балла).

Мне кажется, что вопросы по системе оценивания лучше задавать сюда, чтобы избежать повторений.

In [1]:
# content
import requests # to retrieve page contents
from bs4 import BeautifulSoup # to parse html tags and retrieve info using them
# requests-like for dynamic websites
from selenium import webdriver 
from selenium.webdriver.common.by import By
from selenium.common.exceptions import TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC

# non in-built
from tqdm import tqdm
import pandas as pd # to handle processed data
from fuzzywuzzy import fuzz # compare strings
from fuzzywuzzy import process # compare strings

# standard
from abc import ABC, abstractmethod # abstract class to use for other parsers 
from typing import Union
import sys
import re
import time

In [656]:
COUNTRIES = ['Bahrain', 'Iraq', 'Jordan', 'Kuwait',
             'Lebanon', 'Oman', 'Pakistan', 'Qatar',
             'Saudi Arabia', 'United Arab Emirates']

pd.set_option('display.max_colwidth', 100)

base class

In [859]:
class BaseUniParser(ABC):
    ''' 
        Basic Class to download information about universities 
        and other insitutions from certain websites.

        target information: country,
                            name,
                            website,
                            funding type,
                            majors taught in the university
    '''
    def __init__(self, url):
        # links to a single university on a database (not this university) website
        # sometimes, no such link is available
        self.url = url
        self.profiles={}
    
    @abstractmethod
    def extractProfiles(self):
        # updates self.profiles
        pass

    @abstractmethod
    def extractData(self) -> list:
        # Retrieves the data about a university
        pass
    
    @abstractmethod
    def getSingleUniData(self, url: Union[str, BeautifulSoup]) -> list:
        # extracts the data about a single university
        pass

a class to parse WHED [website](https://www.whed.net/home.php)

In [721]:
class ParseWHED(BaseUniParser):

    # Use a link to a html file with search results to create an instance
    def __init__(self, url="https://www.whed.net/home.php"):
        self.url = url # path to folder with html files with search results
        self.profiles={}     # links for universities at WHED website
        # firefox driver for selenium
        self.browser = webdriver.Firefox(executable_path = 'geckodriver.exe')

        
    def extractProfiles(self, countries: list):
        """ retrieves links to university profiles """
        self.browser.get(self.url)
        self.__iter__(countries)
        
    
    def extractData(self, countries: list) -> list:
        """
            returns tuples are univeristies (country, name, webiste, funding, available majors)
            later can easily be transformed into pandas DataFrame
        """        
        data = []
        
        for country in countries:
            if country in self.profiles and len(self.profiles[country]) > 0:
                for url in tqdm(self.profiles[country], desc=country):
                    data.append(self.getSingleUniData(url, country))
            else:
                sys.stderr.write(
            "\n%s has no available links, extract profiles first.\n" % (country))
                continue

        return data
    
    def getSingleUniData(self, url_to_whed_profile: str, country: str) -> tuple:
        """ return a tuple with a university information """
        resp = requests.get(url_to_whed_profile)
        soup=BeautifulSoup(resp.text, "html5lib")

        name = self.__getUniName(soup)
        funding = self.__getFunding(soup)
        website = self.__getUniWebsite(soup)
        majors = self.__getUniMajors(soup)
        return  country, name, funding, website, majors
    
    
    def __add__(self, countries: list):
        """ 
            adds more profile links for missing countries
            or rewrites them (plus new ones) 
            if the country has been already parsed
        """
        self.__iter__(countries)
        

    def __repr__(self) -> str:
        """ prints a number of profiles per country """
        if not self.profiles:
            return "No available links"
        else:
            output = ""
            for country, links in self.profiles.items():
                output += "%s has %i links to university profiles.\n" % (country, len(links))

            return output
        
        
    def __getattr__(self, country: str) -> str:
        """ checks how many university profiles a given country has """
        if country not in self.profiles or len(self.profiles[country]) == 0:
            return "%s has no univeristy profiles" % (country)

        return "%s has %i university profiles" % (country, len(self.profiles[country]))
    
        
    def __len__(self) -> int:
        """ return a total number of profiles to parse """
        if not self.profile_links:
            return "No available links"
        else:
            output = 0
            for links in self.profiles.values():
                output += len(links)

            return output
        
                
    def __iter__(self, countries: list):
        """ parse country profile for university links """
        for country in countries:
            if country not in self.profiles:
                self.profiles[country] = []
            self.__item__(country)
            self.__extractLinks(country)
                
        
    def __item__(self, country: str):
        """ updates a dynamic webpage """
        # country profile in a dropdown
        option = "//select[@name='Chp1']/option[text()='%s']" % (country)
        # submit button
        button = "//input[@value='Go']"
        self.browser.find_element_by_xpath(option).click()
        self.browser.find_element_by_xpath(button).click()
        

    def __extractLinks(self, country):
        """
            extracts links to all univeristy profiles in a given coutry 
            from the dynamic page and stores them in self.profile_links
        """
        try:
            # sometimes does not load for no obvious reason
            # after the first successful load, everything works fine
            # tried to give time to load javascript - still no change
            max_entries = "//select[@name='nbr_ref_pge']/option[text()='100']"
            self.browser.find_element_by_xpath(max_entries).click()
            print("%s loaded successfully" % (country))
        except:
            print("%s did not load" % (country))
            return
        
        while True:
            # iterates while there is an active Next button
            soup = BeautifulSoup(self.browser.page_source, "html5lib")
            # find all links
            for profile in soup.find_all("a", class_="fancybox fancybox.iframe"):
                self.profiles[country].append('https://www.whed.net/' + profile.get('href'))
                
            if not soup.find_all('a', class_='next'):
                break
            
            next_button = '//a[@class="next"]'
            self.browser.find_element_by_xpath(next_button).click()
        
        # remove duplicates if any
        self.profiles[country] = list(set(self.profiles[country]))
            
    
    def __getUniName(self, soup) -> str:
        """ retrieve uni name """
        try:
            name = ' '.join(soup.find(class_='detail_right').text.split())
            return re.sub('\s\(.*', "", name)
        except Exception:
            return('unknown UniName')
        

    def __getFunding(self, soup) -> str:
        """ retrieve uni type of funding """
        try:
            funding = " ".join(soup.find( # find necessary soup element
                             'span', class_='dt', string='Institution Funding'
                             ).next_sibling.next_sibling.get_text().split())
            return funding.lower()
        except Exception:
            return('No funding data')


    def __getUniWebsite(self, soup) -> str:
        """ retrieve uni website """
        try:
            link = soup.find(class_="lien").text
            # trim http(s) prefix
            link = re.sub(r'http(s)?://(www\.)?', "", link)
            link = re.sub(r'/.*', "", link)
            return link
        except Exception:
            return('unknown link')

        
    def __getUniMajors(self, soup) -> str:
        """ retrieve majors """
        try:
            elems = soup.find_all('span', class_='libelle', string='Fields of study:')

            majors = []
            for elem in elems:
                majors.extend(elem.next_sibling.text.split(', '))

            return ", ".join(set(majors)).lower()
        except:
            return('unknown majors')

In [722]:
whed = ParseWHED()
whed.extractProfiles(COUNTRIES[:4])

Bahrain did not load
Iraq loaded successfully
Jordan loaded successfully
Kuwait loaded successfully


In [723]:
whed

Bahrain has 0 links to university profiles.
Iraq has 93 links to university profiles.
Jordan has 31 links to university profiles.
Kuwait has 12 links to university profiles.

In [724]:
whed + COUNTRIES[4:]

Lebanon loaded successfully
Oman loaded successfully
Pakistan loaded successfully
Qatar loaded successfully
Saudi Arabia loaded successfully
United Arab Emirates loaded successfully


In [725]:
whed

Bahrain has 0 links to university profiles.
Iraq has 93 links to university profiles.
Jordan has 31 links to university profiles.
Kuwait has 12 links to university profiles.
Lebanon has 39 links to university profiles.
Oman has 52 links to university profiles.
Pakistan has 155 links to university profiles.
Qatar has 4 links to university profiles.
Saudi Arabia has 72 links to university profiles.
United Arab Emirates has 53 links to university profiles.

In [726]:
print(whed.Bahrain)
print(whed.__getattr__('Bahrain'))

Bahrain has no univeristy profiles
Bahrain has no univeristy profiles


In [727]:
len(whed)

511

In [728]:
whed + ['Saudi Arabia', 'United Arab Emirates', 'Bahrain']

Saudi Arabia loaded successfully
United Arab Emirates loaded successfully
Bahrain loaded successfully


In [729]:
whed

Bahrain has 13 links to university profiles.
Iraq has 93 links to university profiles.
Jordan has 31 links to university profiles.
Kuwait has 12 links to university profiles.
Lebanon has 39 links to university profiles.
Oman has 52 links to university profiles.
Pakistan has 155 links to university profiles.
Qatar has 4 links to university profiles.
Saudi Arabia has 72 links to university profiles.
United Arab Emirates has 53 links to university profiles.

In [730]:
data = whed.extractData(COUNTRIES)

Bahrain: 100%|██████████| 13/13 [00:06<00:00,  1.90it/s]
Iraq: 100%|██████████| 93/93 [00:45<00:00,  2.03it/s]
Jordan: 100%|██████████| 31/31 [00:16<00:00,  1.91it/s]
Kuwait: 100%|██████████| 12/12 [00:05<00:00,  2.04it/s]
Lebanon: 100%|██████████| 39/39 [00:19<00:00,  2.03it/s]
Oman: 100%|██████████| 52/52 [00:24<00:00,  2.10it/s]
Pakistan: 100%|██████████| 155/155 [01:15<00:00,  2.04it/s]
Qatar: 100%|██████████| 4/4 [00:01<00:00,  2.06it/s]
Saudi Arabia: 100%|██████████| 72/72 [00:35<00:00,  2.02it/s]
United Arab Emirates: 100%|██████████| 53/53 [00:26<00:00,  1.96it/s]


In [807]:
data = whed.extractData(['Saudi Arabia', 'United Arab Emirates', 'Bahrain'])
# data = whed.extractData(['Bahrain'])

Saudi Arabia:  14%|█▍        | 10/72 [00:06<00:42,  1.46it/s]


KeyboardInterrupt: 

In [808]:
whed_df = pd.DataFrame(data, columns=['country', 'university', 'funding', 'website', 'majors'])

In [809]:
whed_df.shape

(524, 5)

In [810]:
whed_df.to_csv('whed.csv', sep=";")

a class to parse the [Commission for Academic Accreditation](https://www.caa.ae/) of UAE

In [759]:
class ParseUAE(BaseUniParser):

    # Use a link to a html file with search results to create an instance
    def __init__(self, url='https://www.caa.ae/Pages/Institutes/All.aspx'):
        self.url = url # MoE website
        # it stores small soup objects with universities
        self.profiles=[] # soups for universities at WHED website
        # firefox driver for selenium
        self.browser = webdriver.Firefox(executable_path = 'geckodriver.exe')

    def extractProfiles(self):
        """ 
            extracts universitites profiles from MoE website
            and stores them in self.profiles[funding]
        """
        self.browser.get(self.url)
        self.__item__() # update dynamic page
        soup = BeautifulSoup(self.browser.page_source, "html5lib")
        self.profiles.extend(self.__iter__(soup))
        
        
    def __iter__(self, soup):
        """ iterate over the first page """
        # skip the first entry as it is heading
        return soup.find_all('tr')[1:]

            
    def __item__(self):
        """ updates dynamic page content """
        dropdown = "//div[@class='selectize-control form-control form-control-sm single']"
        button = "//div[@class='selectize-dropdown-content']/div[text()='100']"
        time.sleep(5)
        try:
            self.browser.find_element_by_xpath(dropdown).click()
            self.browser.find_element_by_xpath(button).click()
#             time.sleep(5)
        except:
            print('The page %s did not load properly' % (self.browser.current_url))
            
            
    def extractData(self) -> list:
        # rerieves all the data about each university
        return [self.getSingleUniData(uni) for uni in self.profiles]           

    def getSingleUniData(self, soup: BeautifulSoup) -> tuple:
        """ return a turple with a university information"""
        name = soup.td.a.text.lower().title()
        website = self.__getUniWebsite(soup)
        majors = self.__getUniMajors(soup)
        
        return 'United Arab Emirates', name, 'No data', website, majors
                
    def __getUniWebsite(self, soup) -> str:
        """ retrieve uni website """
        website = soup.find('span', class_='vuetable-actions').a.get('href')
        if website == 'javascript:void(0);':
            website = 'no website'
        website = re.sub(r'http(s)?:(/){1,2}(www\.)?', "", website)
        website = re.sub(r'/.*', "", website)
        return website
    
    def __getUniMajors(self, soup) -> str:
        """ retrieve uni website """
        profile = 'https://www.caa.ae/' + soup.td.a.get('href')
        self.browser.get(profile)
        self.__item__() # click necessary data
        soup = BeautifulSoup( # remove arabic chars
            (self.browser.page_source).encode('ascii', 'ignore'),
            "html5lib")
        # skip a few first elems with no relevant info
        try:
            return ", ".join(
                [major.td.a.text.lower() for major in soup.find_all('tr')[7:]])
        except:
            return "No majors found"

    
    def __add__(self, soups: list) -> list:
        """ add more profiles to parse """
        return [self.getSingleUniData(uni) for uni in soups]

In [760]:
uae = ParseUAE()

In [761]:
uae.extractProfiles()
uae.profiles[0:1]

[<tr class="odd" role="row">
                     <td class="align-middle sorting_1" tabindex="0">
                         <a href="/Pages/Institutes/Details.aspx?GUID=137" title="ABU DHABI POLYTECHNIC">ABU DHABI POLYTECHNIC</a></td>
                     <td class="text-right">
                         بوليتيكنيك أبوظبي</td>
                     <td class="d-none align-middle">Abu Dhabi</td>
                     <td class="align-middle text-center"><span class="vuetable-actions">
                         <a data-original-title="https://www.adpoly.ac.ae/En/Pages/Home.aspx" data-placement="top" data-toggle="tooltip" href="https://www.adpoly.ac.ae/En/Pages/Home.aspx" target="_blank" title=""><i class="icon-link2"></i></a></span></td>
                     <td class="d-none">A</td>
                 </tr>]

In [762]:
uae_data = uae.extractData()

The page https://www.caa.ae//Pages/Institutes/Details.aspx?GUID=164 did not load properly


In [187]:
extra = uae.profiles[3:5]
uae_data += (uae + extra)

In [763]:
uae_data[0:1]

[('United Arab Emirates',
  'Abu Dhabi Polytechnic',
  'No data',
  'adpoly.ac.ae',
  'applied bachelor in aircraft maintenance management, applied bachelor in electromechanical engineering technology, applied bachelor in information security engineering technology, applied bachelor in petroleum engineering technology, bachelor of science in meteorology, certificate in aircraft maintenance, diploma in air traffic management, diploma in aircraft engineering technologies, diploma in aircraft maintenance technology, diploma in electromechanical engineering, diploma in meteorology, diploma in oil and gas process engineering, higher diploma in air traffic management, higher diploma in aircraft engineering technology  aeromechanic, higher diploma in aircraft engineering technology  avionic, higher diploma in aircraft maintenance technology  aeromechanics , higher diploma in aircraft maintenance technology  avionics, higher diploma in electromechanical engineering technology, higher diploma i

In [764]:
uae_df = pd.DataFrame(uae_data, columns=['country', 'university', 'funding', 'website', 'majors'])

In [765]:
uae_df.to_csv('uae.csv', sep=";")

a class to parse [MoE](https://www.moe.gov.sa/en) of Saudi Arabia

In [695]:
class ParseSaudiArabia(BaseUniParser):

    # Use a link to a html file with search results to create an instance
    def __init__(self, url='https://www.moe.gov.sa/en/education/highereducation/Pages/'):
        self.url = url # partial path
        # second part of links to university profiles grouped by funding
        # it stores small soup objects with universities
        self.profiles={'UniversitiesList.aspx': [], 'PrivateUniversity.aspx': []}
        # firefox driver for selenium
        self.browser = webdriver.Firefox(executable_path = 'geckodriver.exe')

        
    def extractProfiles(self):
        """ 
            extracts universitites profiles from MoE website
            and stores them in self.profiles[funding]
        """
        for profile in self.profiles.keys():
            # limit the time to load
            self.browser.implicitly_wait(10)
            self.browser.get(self.url + profile)
                
            soup = BeautifulSoup(self.browser.page_source, "html5lib")
            self.profiles[profile].extend(soup.find_all(
                'div',
                class_="blog-info text-right"))
            

    def extractData(self) -> list:
        # rerieves all the data about each university in the html file
        data = []
        
        for funding, profiles in self.profiles.items():
            if funding == 'UniversitiesList.aspx':
                funding = 'public'
            else:
                funding = 'private'
                
            for profile in profiles:
                data.append(self.getSingleUniData(profile, funding))
                
        return data
                

    def getSingleUniData(self, soup: BeautifulSoup, funding) -> tuple:
        """ # return a turple with a university information"""
        
        name = soup.h3.text
        website = self.__getUniWebsite(soup)
        
        return 'Saudi Arabia', name, funding, website, 'no available majors'

    
    def __getUniWebsite(self, soup: BeautifulSoup) -> str:
        website = soup.find("a", string='Website').get('targeturl')
        website = re.sub(r'http(s)?:(/){1,2}/(www\.)?', "", website)
        website = re.sub(r'/.*', "", website)
        return website
    
    def __repr__(self) -> str:
        """ prints a number of profiles per type of funding """
        output = ""
        for funding, profiles in self.profiles.items():
            output += "%s has %i profiles of universities.\n" % (funding, len(profiles))

        return output

In [696]:
sa = ParseSaudiArabia()

In [697]:
sa.extractProfiles()
sa

UniversitiesList.aspx has 29 profiles of universities.
PrivateUniversity.aspx has 14 profiles of universities.

In [698]:
saudi_data = sa.extractData()

In [699]:
print(saudi_data[35:40])

[('Saudi Arabia', 'Alfaisal University', 'private', 'alfaisal.edu', 'no available majors'), ('Saudi Arabia', 'Al Yamamah University', 'private', 'yu.edu.sa', 'no available majors'), ('Saudi Arabia', 'Dar Al Uloom University', 'private', 'dau.edu.sa', 'no available majors'), ('Saudi Arabia', 'Effat University', 'private', 'effatuniversity.edu.sa', 'no available majors'), ('Saudi Arabia', 'Dar Al-Hekma University', 'private', 'dah.edu.sa', 'no available majors')]


In [700]:
saudi_df = pd.DataFrame(saudi_data, columns=['country', 'university', 'funding', 'website', 'majors'])
saudi_df = saudi_df.iloc[:, 0:4]

In [701]:
saudi_df.tail(5)

Unnamed: 0,country,university,funding,website
38,Saudi Arabia,Effat University,private,effatuniversity.edu.sa
39,Saudi Arabia,Dar Al-Hekma University,private,dah.edu.sa
40,Saudi Arabia,Riyadh Elm University,private,home.riyadh.edu.sa
41,Saudi Arabia,Al Maarefa University,private,um.edu.sa
42,Saudi Arabia,Mustaqbal University,private,uom.edu.sa


In [116]:
uae_df = pd.read_csv('uae.csv', sep=";").loc[:, 'country':]
saudi_df = pd.read_csv('saudi.csv', sep=";").loc[:, 'country':]
whed_df = pd.read_csv('whed.csv', sep=";").loc[:, 'country':]

In [117]:
uae_df.head(1)

Unnamed: 0,country,university,funding,website,majors
0,United Arab Emirates,Abu Dhabi Polytechnic,No data,adpoly.ac.ae,applied bachelor in aircraft maintenance manag...


In [118]:
saudi_df.head()

Unnamed: 0,country,university,funding,website
0,Saudi Arabia,Umm Al-Qura University,public,uqu.edu.sa
1,Saudi Arabia,Islamic University of Madinah,public,enweb.iu.edu.sa
2,Saudi Arabia,Imam Muhammad bin Saud Islamic University,public,imamu.edu.sa
3,Saudi Arabia,King Saud University,public,ksu.edu.sa
4,Saudi Arabia,King Abdulaziz University,public,kau.edu.sa


In [119]:
whed_df.shape

(524, 5)

a class to align datasets between whed and other more official databases using fuzzymatching

In [120]:
class FuzzyAligner():
    """ 
        cross validates data between WHED and any other source
        receives two Pandas Dataframes 
        and aligns them using fuzzy matching
    """
    # Use a link to a html file with search results to create an instance
    def __init__(self, df: pd.DataFrame):
        # sort df by country and univeristy
        self.df = df.sort_values(by=['country', 'university']).reset_index().rename(columns={'index': 'old_index'})

        
    def __sub__(self, other: pd.DataFrame) -> pd.DataFrame:
        """ 
            returns the initial dataset reduced to the countries present in a passed one
        """
        countries = list(other.country.unique())
        return self.__getattr__(countries)


    def __eq__(self, other: pd.DataFrame):
        """ 
            compares the initial dataset dataset with the given one
            the given dataset receives two new columns:
            match_by_unversity and match_by_webiste
            The value is the best match and its score
        """
        output = other.copy()
        reduced = self.__sub__(output)
        ids = reduced.old_index.to_list()
        uni2id = {uni.lower(): idx for idx, uni in zip(ids, reduced.university.to_list())}
        web2id = {web: idx for idx, web in zip(ids, reduced.website.to_list())}
        output['match_by_unversity'] = other[['university']].apply(self.fuzzymatching, value2id=uni2id)
        output['match_by_website'] = other[['website']].apply(self.fuzzymatching, value2id=web2id)
        
        return output
                                           
        
    def fuzzymatching(self, values2match, value2id: dict):
        """ finds the best match of the row value"""
        keys = list(value2id.keys())
        best_matches = []
        for value2match in values2match:
            best_match = process.extractOne(value2match.lower(), # to compare
                                            keys, # comparing with
                                            scorer=fuzz.ratio, # comparing function
                                            score_cutoff=90) # bottom score
            
            if best_match != None:
                best_match = value2id[best_match[0]]
                
            best_matches.append(best_match)
        
#         return pd.Series(best_matches)
        
        return pd.Series(best_matches).fillna(0.0).astype(int)


    def __repr__(self):
        """ prints the info per country """
        output = ""
        countries = self < 'country'
        for country in countries:
             output += "%s has %i universities.\n" % (country, self.__getattr__(country).shape[0])

        return output

    def __str__(self):
        """ prints out what countries are in the initial dataset """
        countries = self < 'country'
        countries =  ", ".join(countries)

        return "The dataset contains the information about universities in %s." % (countries)
        
    
    def __len__(self) -> tuple:
        """ prints the number of entries in the initial dataset """
        return self.df.shape[0]
        
    
    def __getattr__(self, countryFilter: Union[str or list]) -> pd.DataFrame:
        """ 
            return the initial dataset filtered by a given option
            {column: value} -> e.g. {'country': 'Saudi Arabia'}
        """
        if isinstance(countryFilter, str):
            return self.df[self.df['country'] == countryFilter]
        elif isinstance(countryFilter, list):
            return self.df[self.df['country'].isin(countryFilter)]    

    def __lt__(self, column: str) -> list:
        """ return a list with unique values in a given """
        return list(self.df[column].unique())
    
    def __gt__(self, column: str) -> list:
        """ return a list of boolean values showing which one is duplicated """
        return list(self.df[column].duplicated(keep=False))

In [121]:
aligner = FuzzyAligner(whed_df)

In [122]:
aligner < 'country'

['Bahrain',
 'Iraq',
 'Jordan',
 'Kuwait',
 'Lebanon',
 'Oman',
 'Pakistan',
 'Qatar',
 'Saudi Arabia',
 'United Arab Emirates']

In [123]:
aligner.df[aligner > 'university']

Unnamed: 0,old_index,country,university,funding,website,majors
211,193,Oman,Modern College of Business and Science,private,mcbs.edu.om,"economics, information management, information..."
212,217,Oman,Modern College of Business and Science,private,mcbs.edu.om,"accountancy, economics, transport management, ..."
289,324,Pakistan,Institute of Management Sciences,public,imsciences.edu.pk,"management, economics, development studies, pu..."
290,393,Pakistan,Institute of Management Sciences,private,pakaims.edu.pk,"management, accountancy, commercial law, softw..."


In [124]:
aligner - uae_df

Unnamed: 0,old_index,country,university,funding,website,majors
471,485,United Arab Emirates,Abu Dhabi Polytechnic,public,adpoly.ac.ae,"meteorology, engineering"
472,497,United Arab Emirates,Abu Dhabi University,private,adu.ac.ae,"electrical engineering, educational administra..."
473,511,United Arab Emirates,Ajman University Ajman University of Science a...,private,ajman.ac.ae,"electrical engineering, water management, soci..."
474,475,United Arab Emirates,Al Ain University Al Ain University of Science...,private,aau.ac.ae,"accountancy, education, pharmacy, law, managem..."
475,509,United Arab Emirates,Al Falah University,private,afu.ac.ae,"arabic, communication studies, law, journalism..."
476,481,United Arab Emirates,Al Ghurair University,private,agu.ac.ae,"public law, educational sciences, finance, bus..."
477,482,United Arab Emirates,Al Qasimia University,private,alqasimia.ac.ae,"economics, arabic, mass communication, arts an..."
478,477,United Arab Emirates,American University in Dubai,private,aud.edu,"electrical engineering, psychology, advertisin..."
479,508,United Arab Emirates,American University in the Emirates,private,aue.ae,"psychology, radio and television broadcasting,..."
480,488,United Arab Emirates,American University of Ras Al Khaimah George M...,public,aurak.ac.ae,"chemical engineering, electrical and electroni..."


In [125]:
str(aligner)

'The dataset contains the information about universities in Bahrain, Iraq, Jordan, Kuwait, Lebanon, Oman, Pakistan, Qatar, Saudi Arabia, United Arab Emirates.'

In [128]:
test_df = aligner == uae_df

In [127]:
uae_df

Unnamed: 0,country,university,funding,website,majors
0,United Arab Emirates,Abu Dhabi Polytechnic,No data,adpoly.ac.ae,applied bachelor in aircraft maintenance manag...
1,United Arab Emirates,Abu Dhabi School Of Management,No data,no website,"bachelor of science in management, master of b..."
2,United Arab Emirates,Abu Dhabi University,No data,adu.ac.ae,"bachelor of architecture, bachelor of arts in ..."
3,United Arab Emirates,Abu Dhabi Vocational Education And Training In...,No data,veti.ac.ae,"certificate 4 in accounting, certificate 4 in ..."
4,United Arab Emirates,Ajman University,No data,ajman.ac.ae,"bachelor in computer science, bachelor of arch..."
...,...,...,...,...,...
69,United Arab Emirates,University Of South Wales Dubai,No data,no website,bachelor of science in aircraft maintenance en...
70,United Arab Emirates,University Of Strathclyde Business School- Uae,No data,www.strath.ac.uk,"master of business administration, master of s..."
71,United Arab Emirates,University Of Wollongong In Dubai,No data,uowdubai.ac.ae,"bachelor of business administration, bachelor ..."
72,United Arab Emirates,Zayed Ii Military College,No data,zmc.mil.ae,higher diploma in military sciences


In [129]:
test_df

Unnamed: 0,country,university,funding,website,majors,match_by_unversity,match_by_website
0,United Arab Emirates,Abu Dhabi Polytechnic,No data,adpoly.ac.ae,applied bachelor in aircraft maintenance manag...,485,485
1,United Arab Emirates,Abu Dhabi School Of Management,No data,no website,"bachelor of science in management, master of b...",0,0
2,United Arab Emirates,Abu Dhabi University,No data,adu.ac.ae,"bachelor of architecture, bachelor of arts in ...",497,497
3,United Arab Emirates,Abu Dhabi Vocational Education And Training In...,No data,veti.ac.ae,"certificate 4 in accounting, certificate 4 in ...",0,0
4,United Arab Emirates,Ajman University,No data,ajman.ac.ae,"bachelor in computer science, bachelor of arch...",0,511
...,...,...,...,...,...,...,...
69,United Arab Emirates,University Of South Wales Dubai,No data,no website,bachelor of science in aircraft maintenance en...,0,0
70,United Arab Emirates,University Of Strathclyde Business School- Uae,No data,www.strath.ac.uk,"master of business administration, master of s...",0,0
71,United Arab Emirates,University Of Wollongong In Dubai,No data,uowdubai.ac.ae,"bachelor of business administration, bachelor ...",512,0
72,United Arab Emirates,Zayed Ii Military College,No data,zmc.mil.ae,higher diploma in military sciences,0,0
