# FindArtists Primary Operation Flow

## Open input file

#### Module

In [100]:
import pandas as pd
from find_artists import cache, logger
# from find_artists.utils.log.logger import 

class ArtistFileReader:
    def __init__(self, file_path):
        self.file_path = file_path

    def read_file(self, sample_size=None, random_state=None):
        try:
            if self.file_path.endswith('.csv'):
                data = pd.read_csv(self.file_path)
            elif self.file_path.endswith('.xlsx'):
                data = pd.read_excel(self.file_path)
            else:
                logger.warning("Unsupported file format.")
                return

            if sample_size is not None:
                data = data.sample(n=sample_size, random_state=random_state)

            cache['artists'] = data['artist'].tolist()
        except FileNotFoundError:
            logger.warning("File not found: %s.", self.file_path)

#### Test

In [101]:
reader = ArtistFileReader('/Users/charliemarshall/Documents/GitHub/FindArtists/src/find_artists/data/input/names_of_artists.xlsx')
reader.read_file(sample_size=50, random_state=36)
artists = cache['artists']
print(artists)

['Barbie Bertisch', 'Broken English Club', 'Mella Dee', 'Surgeon', 'The Knife', 'Patrick Topping', 'Stimming', 'Hector Couto', 'Luciano', '&ME', 'Shlømo', 'Slam', 'Justice', 'Answer Code Request', 'Fumiya Tanaka', 'Agoria', 'Tama Sumo', 'DJ BORING', 'Vlada', 'Josey Rebelle', 'Janina', 'Edward', 'Maceo Plex', 'Rick Wade', 'Clara 3000', 'Doc Martin', 'Praslea', 'Carl Cox', 'Ben Klock', 'Karenn', 'Raresh', 'Richie Hawtin', 'Lawrence', 'Martin Buttrich', 'Eric Cloutier', 'Madmotormiquel', 'Lauren Lane', 'Damian Lazarus', 'Benji B', 'Claire Morgan', 'Lazare Hoche', 'Maayan Nidam', 'Francesco Del Garda', 'Yousef', 'Ivan Smagghe', 'Dennis Ferrer', 'Pearson Sound', 'Julia Holter', 'I:Cube', 'Truss']


## Retrieve Data

### Normalise String Data

In [102]:
import re
class NormalizeStrings:
    def __init__(self):
        pass

    def normalize(self, string):
        # Put all letters to lower case
        string = string.lower()

        # Remove Whitespace
        string = string.strip()

        # Remove punctuation
        string = re.sub(r"[^\w\s]", "", string)
        
        return string

### Find Wikipedia Page

#### Module

In [103]:
from collections import namedtuple
from find_artists import cache, logger
from tqdm import tqdm
import pywikibot
from pywikibot.exceptions import NoPageError, InvalidTitleError

class WikiIDFinder:

    def __init__(self, artists):
        self.artists = artists
        self.ArtistID = namedtuple('ArtistID', ['name', 'id'])
        self.site = pywikibot.Site("en", "wikipedia")
        cache['artists_ids'] = []
        self.normalizer = NormalizeStrings()

    def find(self):
        for artist in tqdm(self.artists, unit='artists'):
            #artist = self.normalizer.normalize(artist)
            try:
                page_info = self._get_artist_id(artist)
            except NoPageError:
                logger.warning('No page found for: %s', artist)
                continue
            except InvalidTitleError as err:
                logger.warning(err)
                continue
            artist_id = self._retrieve_id(page_info)
            cache['artists_ids'].append(self.ArtistID(artist, artist_id))

    def _get_artist_id(self, artist):
        try:
            page = pywikibot.Page(self.site, f"{artist}")
        except NoPageError:
            raise
        except InvalidTitleError:
            raise
            
        page_info = pywikibot.ItemPage.fromPage(page)
        return page_info
    
    def _retrieve_id(self, page_info):

        # TURN THE ITEM TO STRING FORMAT
        info_str = str(page_info)

        # SPLIT THE STRING ITEMS ON SEMI-COLON INTO LIST
        info_items = info_str.split(":")

        # SELECT THE SECOND ITEM IN THE LIST
        id_item = info_items[1]

        # REMOVE THE SQUARE BRACKETS FROM LIST
        artist_id = id_item.replace("]", "")

        # RETURN THE CLEANED ID
        return artist_id


#### Test

In [104]:
from find_artists.cache.cache_context_manager import cache_manager

with cache_manager('artists') as artists:
    finder = WikiIDFinder(artists)
    finder.find()

artists_ids = cache['artists_ids']
print(artists_ids)

100%|██████████| 50/50 [00:17<00:00,  2.78artists/s]

[ArtistID(name='Mella Dee', id='Q64746220'), ArtistID(name='Surgeon', id='Q774306'), ArtistID(name='The Knife', id='Q741471'), ArtistID(name='Patrick Topping', id='Q20687412'), ArtistID(name='Stimming', id='Q15780353'), ArtistID(name='Luciano', id='Q21504865'), ArtistID(name='Slam', id='Q424713'), ArtistID(name='Justice', id='Q13189320'), ArtistID(name='Agoria', id='Q2721180'), ArtistID(name='Tama Sumo', id='Q16942587'), ArtistID(name='Vlada', id='Q7938169'), ArtistID(name='Josey Rebelle', id='Q117319833'), ArtistID(name='Edward', id='Q278835'), ArtistID(name='Maceo Plex', id='Q16661959'), ArtistID(name='Rick Wade', id='Q17102010'), ArtistID(name='Doc Martin', id='Q1072839'), ArtistID(name='Carl Cox', id='Q319719'), ArtistID(name='Ben Klock', id='Q816535'), ArtistID(name='Karenn', id='Q21074917'), ArtistID(name='Richie Hawtin', id='Q378876'), ArtistID(name='Lawrence', id='Q219262'), ArtistID(name='Lauren Lane', id='Q434769'), ArtistID(name='Damian Lazarus', id='Q3012699'), ArtistID(nam




### Retrieve Wikidata Claims

#### Module

In [105]:
from collections import namedtuple
from tqdm import tqdm
import pywikibot
class WikiClaimsFetcher:

    def __init__(self, artists_ids):
        self.artists_ids = artists_ids
        cache['wiki_claims'] = []
        self.WikiDataPage = namedtuple('WikiDataPage', ['name', 'id', 'wiki_claims'])
        self.site = pywikibot.Site("wikidata", "wikidata")

    def fetch(self):
        for artist_id in tqdm(self.artists_ids):
            wiki_claims = self._retrieve_wiki_claims_list(artist_id.id)
            cache['wiki_claims'].append(self.WikiDataPage(artist_id.name, artist_id.id, wiki_claims))

    def _retrieve_wiki_claims_list(self, artist_id):
        
        repo = self.site.data_repository()
        item_page = pywikibot.ItemPage(repo, artist_id)
        wikidata = item_page.get()
        wiki_claims = wikidata['claims']
        return wiki_claims

    

#### Test

In [106]:
with cache_manager('artists_ids') as artists_ids:
    fetcher = WikiClaimsFetcher(artists_ids)
    fetcher.fetch()

wiki_claims = cache['wiki_claims']
#print(wiki_claims)

100%|██████████| 32/32 [00:08<00:00,  3.70it/s]


### Parse Wikidata Claims

#### Module

In [107]:
from collections import namedtuple
from tqdm import tqdm
import pywikibot
class ParseWikidataClaims:
    def __init__(self, artists_wiki_claims):
        self.artists_wiki_claims = artists_wiki_claims
        site = pywikibot.Site("wikidata", "wikidata")
        self.repo = site.data_repository()
        cache['parsed_claims'] = []
        self.ParsedClaims = namedtuple('ParsedClaim', ['name', 'id', 'claims'])
        
    def parse(self):
        for artist_wiki_claims in tqdm(self.artists_wiki_claims):
            parsed_claims = self._parse_wiki_claims(artist_wiki_claims.wiki_claims)
            parsed_claims = self.ParsedClaims(artist_wiki_claims.name, 
                            artist_wiki_claims.id, parsed_claims)
            cache['parsed_claims'].append(parsed_claims)


    def _parse_wiki_claims(self, artist_wiki_claims):
        wiki_claims = []
        for claim_list in artist_wiki_claims.values():
            for claim_json in claim_list:
                wiki_claim = pywikibot.Claim.fromJSON(self.repo, claim_json.toJSON())
                wiki_claims.append(wiki_claim)

        return wiki_claims

#### Test

In [108]:
with cache_manager('wiki_claims') as wiki_claims:
    parser = ParseWikidataClaims(wiki_claims)
    parser.parse()

100%|██████████| 32/32 [00:00<00:00, 553.77it/s]


### Extract Wikidata Claims

In [109]:
from collections import namedtuple
from tqdm import tqdm
import pywikibot
class WikidataClaimExtractor:
    def __init__(self, wikidata_claims):
        self.ExtractedData = namedtuple('ExtractedData', ['name', 'id', 'claims'])
        self.wikidata_claims = wikidata_claims

    def extract(self):
        for claims in self.wikidata_claims:
            artist_claims = self._find_claims(claims.claims)
            print(artist_claims)
            
    def _find_claims(self, claims):
        artists_claims = {}
        for claim in claims:
            property_id = claim.getID()
            value = claim.getTarget()
            print(type(value))
            artists_claims[property_id] = value

        return artists_claims

#### Test

In [110]:
with cache_manager('parsed_claims') as parsed_claims:
    extractor = WikidataClaimExtractor(parsed_claims)
    extractor.extract()

<class 'pywikibot.page._wikibase.ItemPage'>
<class 'pywikibot.page._wikibase.ItemPage'>
<class 'pywikibot.page._wikibase.ItemPage'>
<class 'pywikibot.page._wikibase.ItemPage'>
<class 'str'>
{'P31': ItemPage('Q5'), 'P21': ItemPage('Q6581097'), 'P106': ItemPage('Q183945'), 'P19': ItemPage('Q58900'), 'P2671': '/g/11bx49hgjy'}
<class 'str'>
<class 'str'>
<class 'str'>
<class 'pywikibot.page._wikibase.ItemPage'>
<class 'pywikibot.page._wikibase.ItemPage'>
<class 'str'>
<class 'pywikibot.page._filepage.FilePage'>
<class 'pywikibot.page._wikibase.ItemPage'>
<class 'pywikibot.page._wikibase.ItemPage'>
<class 'pywikibot.page._wikibase.ItemPage'>
<class 'str'>
<class 'pywikibot.page._wikibase.ItemPage'>
<class 'str'>
<class 'pywikibot.page._wikibase.ItemPage'>
<class 'pywikibot.page._wikibase.ItemPage'>
<class 'pywikibot.page._wikibase.ItemPage'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'str'>
<class 'pywikibot.page._wikibase.ItemPage'>
<class 'pywikibot.page.