## Libraries

In [3]:
import re
import requests
import pandas as pd
from bs4 import BeautifulSoup as BS
import dateparser

## Load data

In [4]:
# Read the saved HTML page into memory as a single string.
file_path_input = '(10) Genentech_ Overview _ LinkedIn.html'
with open(file_path_input, mode="r", encoding='utf-8') as html_file:
    text = html_file.read()

## Processing

In [5]:
# Build the parse tree, then collect every <div> whose class attribute
# matches the string below; the cell's displayed value is how many were found.
soup = BS(text, features='html.parser')
post_container = soup.find_all(name='div', class_="occludable-update ember-view")
len(post_container)

39

In [6]:
class Post:
    """Extract the fields of one post from its parsed HTML subtree.

    The instance wraps a BeautifulSoup-style element (anything offering
    ``find`` / ``find_all``) and exposes each extracted field as a property.
    ``parse()`` gathers all of them into a single flat dict.

    Every property returns ``None`` when the element it looks for is absent,
    except ``likes`` / ``comments`` which return the int ``0``.
    """

    def __init__(self, soup):
        """Store the element that all lookups are performed against."""
        self.soup = soup

    @property
    def date(self):
        """Post date as a ``'%Y-%m-%d'`` string, or ``None`` if unavailable.

        The first whitespace-separated token of the first
        ``<span class="ember-view">`` is rewritten and handed to
        ``dateparser.parse``.
        """
        spans = self.soup.find_all('span', class_="ember-view")
        # find_all returns a (possibly empty) list, never None.
        if not spans:
            return None
        tokens = spans[0].get_text().split()
        if not tokens:
            return None
        val = tokens[0]
        # Translating linkedin shorthand to dateparser shorthand:
        # a trailing 'm' becomes 'min', then a trailing 'mo' becomes 'm'.
        val = re.sub(r'm\b', 'min', val, count=1)
        val = re.sub(r'mo\b', 'm', val, count=1)
        parsed = dateparser.parse(val)
        # dateparser.parse returns None for text it cannot interpret.
        if parsed is None:
            return None
        return parsed.strftime('%Y-%m-%d')

    @property
    def content(self):
        """Text of the third ``aria-hidden`` span, or ``None`` if there are fewer."""
        spans = self.soup.find_all('span', {'aria-hidden': True})
        # NOTE(review): index 2 is presumably tied to the page layout - confirm.
        if len(spans) > 2:
            return spans[2].get_text()
        return None

    def _social_count(self, css_class):
        """Return the first token of the counter button's span text, or 0.

        The value is returned as the text found in the page (a ``str``);
        the int ``0`` is returned only when no such button exists.
        """
        button = self.soup.find('button', css_class)
        if button is None:
            return 0
        return button.span.get_text().split()[0]

    @property
    def likes(self):
        """Like count shown on the post (``str``), or ``0`` if no counter exists."""
        return self._social_count('feed-shared-social-counts__num-likes')

    @property
    def comments(self):
        """Comment count shown on the post (``str``), or ``0`` if no counter exists."""
        return self._social_count('feed-shared-social-counts__num-comments')

    @property
    def article(self):
        """Dict describing the first ``<article>`` element, or ``None``."""
        article = self.soup.find('article')
        if article is None:
            return None
        return self.parse_article(article)

    @staticmethod
    def parse_article(article):
        """Build ``{'media', 'url', 'title', 'subtitle'}`` from an article element.

        Each value is ``None`` when the corresponding tag is missing.
        """
        def get_url(article):
            # href of the first link inside the article
            a = article.find('a')
            if a is not None:
                return a.get('href')

        def get_title(article):
            # text of the first span inside the article
            span = article.find('span')
            if span is not None:
                return span.get_text()

        def get_subtitle(article):
            # stripped text of the first h3 inside the article
            h3 = article.find('h3')
            if h3 is not None:
                return h3.get_text().strip()

        return {
            'media': 'article',
            'url': get_url(article),
            'title': get_title(article),
            'subtitle': get_subtitle(article)
        }

    @property
    def image(self):
        """Dict describing the first ``feed-shared-image__image`` div, or ``None``."""
        image = self.soup.find('div', 'feed-shared-image__image')
        if image is None:
            return None
        return self.parse_image(image)

    @staticmethod
    def parse_image(image):
        """Build ``{'media', 'url'}`` from an element with an inline ``style``.

        ``url`` is the target of the first CSS ``url(...)`` in the style
        attribute (double-quoted, single-quoted or unquoted), or ``None``
        when there is no style attribute or it contains no ``url(...)``.
        """
        def get_background_url(image):
            style = image.get('style')
            if style is None:
                return None
            # A regex avoids the index arithmetic of find()/slicing, where an
            # offset measured inside a sub-slice was used as an absolute index.
            match = re.search(r"""url\(\s*(["']?)(.*?)\1\s*\)""", style)
            if match is None:
                return None
            return match.group(2)

        return {
            'media': 'image',
            'url': get_background_url(image),
        }

    @property
    def video(self):
        """Dict describing the post's video, or ``None`` if there is no ``<iframe>``."""
        video = self.soup.find('iframe')
        if video is None:
            return None
        # The container searched for the title is three levels above the iframe.
        div_vid = video.parent.parent.parent
        return self.parse_video(div_vid)

    @staticmethod
    def parse_video(div_vid):
        """Build ``{'media', 'title'}``; title is the ``alt`` of the first ``<img>``."""
        def get_title(div_vid):
            image = div_vid.find('img')
            if image is not None:
                return image.get('alt')

        return {
            'media': 'video',
            'title': get_title(div_vid),
        }

    def parse(self):
        """Return all extracted fields as one flat dict.

        Always contains ``date``, ``content``, ``likes`` and ``comments``.
        Keys from the article, image and video dicts are merged in that
        order when present, so a later one overwrites shared keys such as
        ``media``.
        """
        d = {
            'date': self.date,
            'content': self.content,
            'likes': self.likes,
            'comments': self.comments,
        }

        for media in (self.article, self.image, self.video):
            if media is not None:
                d.update(media)

        return d

In [10]:
# One parsed dict per post element, in page order.
l = [Post(post_soup).parse() for post_soup in post_container]

In [11]:
# Put the columns in a fixed order. reindex is used instead of df[[...]]
# because 'title', 'media', 'subtitle' and 'url' only exist when at least one
# post supplied them; reindex adds any missing column filled with NaN rather
# than raising KeyError.
df = pd.DataFrame(l).reindex(
    columns=['date', 'title', 'content', 'likes', 'comments', 'media', 'subtitle', 'url']
)
print(df.shape)
df.head(3)

(39, 8)


Unnamed: 0,date,title,content,likes,comments,media,subtitle,url
0,2018-12-11,"Genentech: Press Releases | Tuesday, Dec 4, 2018",The FDA recently granted Priority Review to ou...,75,1,article,gene.com,https://www.gene.com/media/press-releases/1477...
1,2018-12-11,"Genentech: Press Releases | Wednesday, Dec 5, ...","Today at SABCS, we're sharing positive PhIII r...",47,0,article,gene.com,http://bit.ly/2QzfFbR
2,2018-12-11,Meet Henri Jasper PhD,Finding a place where your creative thinking a...,112,2,video,,


## Output data

In [7]:
file_path_output = './Linkedin Parsed.xlsx'
# The `options=` keyword of ExcelWriter was deprecated in pandas 1.3 and
# removed in 2.0; engine options are passed through `engine_kwargs` instead.
# 'strings_to_urls' is an xlsxwriter workbook option, so the engine is named
# explicitly. The with-block closes the workbook even if to_excel raises.
with pd.ExcelWriter(
    file_path_output,
    engine='xlsxwriter',
    engine_kwargs={'options': {'strings_to_urls': False}},
) as writer:
    df.to_excel(writer, index=False)