In [95]:
import requests
from bs4 import BeautifulSoup as BSoup
import pandas as pd

In [None]:
class SecFiling:
    """Holds the data of a generic SEC filing.

    Expected to be subclassed by 'SecFiling10K' and 'SecFiling10Q'.

    Every method except the constructor is currently an unimplemented
    placeholder that does nothing and returns None.
    """

    def __init__(self, ticker):
        """Remember the ticker symbol this filing belongs to.

        Args:
            ticker: Ticker symbol of the company; stored unchanged on
                ``self.ticker``.
        """
        self.ticker = ticker

    def download(self):
        """Retrieves a SEC filing from EDGAR (not implemented yet)."""
        pass

    def load(self):
        """Loads the data of the SEC filing (not implemented yet)."""
        pass

    def save(self):
        """Saves the data to file (not implemented yet)."""
        pass

    def getEps(self):
        """Retrieves the current EPS from the filing (not implemented yet)."""
        pass

    def getSales(self):
        """Retrieves the current Sales data from the filing (not implemented yet)."""
        pass

    def getRoe(self):
        """Retrieves the current Return on Equity from the filing (not implemented yet)."""
        pass

In [None]:
class CanslimParams:
    """Collects the CANSLIM-related parameters for a single ticker.

    Every method except the constructor is currently an unimplemented
    placeholder that does nothing and returns None; the docstrings describe
    the intended behavior.
    """

    def __init__(self, ticker):
        """Remember the ticker symbol to analyze.

        Args:
            ticker: Ticker symbol of the company; stored unchanged on
                ``self.ticker``.
        """
        self.ticker = ticker

    def loadData(self):
        """Loads the relevant SEC filings for analysis.

        Loads the last 4 10-K filings and the last 16(?) 10-Q filings. If necessary,
        retrieves them from EDGAR and saves the raw files.
        """
        pass

    def getEpsQuarter(self, quarter):
        """Returns the EPS for the specified quarter.

        The quarter is specified as an integer counting backwards, e.g. 0 (zero) is the current quarter,
        -1 (minus one) is the previous quarter, etc. For readability, the minus sign is required. Only
        integers between -15 and 0 are allowed.
        """
        pass

    def getEpsAnnual(self, year):
        """Returns the EPS for the specified year.

        The year is specified as an integer counting backwards, e.g. 0 (zero) is the most recent reported year,
        -1 (minus one) is the previous year, etc. For readability, the minus sign is required. Only
        integers between -3 and 0 are allowed.
        """
        pass

    def getRoeCurrent(self):
        """Returns the most recent Return on Equity."""
        pass

    def getSalesQuarter(self, quarter):
        """Returns the Sales for the specified quarter.

        The quarter is specified as an integer counting backwards, e.g. 0 (zero) is the current quarter,
        -1 (minus one) is the previous quarter, etc. For readability, the minus sign is required. Only
        integers between -15 and 0 are allowed.
        """
        pass

    def getSalesAnnual(self, year):
        """Returns the Sales for the specified year.

        The year is specified as an integer counting backwards, e.g. 0 (zero) is the most recent reported year,
        -1 (minus one) is the previous year, etc. For readability, the minus sign is required. Only
        integers between -3 and 0 are allowed.
        """
        pass

    def getEpsGrowthQuarter(self, q1, q2):
        """Calculates the EPS growth (%) for quarter q1 compared to q2.

        The EPS growth is calculated as the ratio EPS(q1)/EPS(q2) * 100%.
        """
        pass

    def getEpsGrowthAnnual(self, a1, a2):
        """Calculates the EPS growth (%) for year a1 compared to a2.

        The EPS growth is calculated as the ratio EPS(a1)/EPS(a2) * 100%.
        """
        pass

    def getStabilityOfEpsGrowth(self, numQuarters):
        """Calculates the stability of the quarterly EPS growth over the last numQuarters.

        The stability is calculated as the amount of deviation from the best-fit-line growth.
        In other words, a line is fitted through the data, and the goodness-of-fit is determined.
        """
        pass

    def getEpsGrowthAcceleration(self, numQuarters):
        """Returns the (mean) acceleration of EPS growth over the specified number of quarters.

        The acceleration is calculated as the second derivative of the data. numQuarters is required to
        be between 2 and 15. At least three quarters are necessary to calculate acceleration.
        """
        pass

    def getSalesGrowth(self, q1, q2):
        """Calculates the Sales growth (%) for quarter q1 compared to q2.

        The Sales growth is calculated as the ratio Sales(q1)/Sales(q2) * 100%.
        """
        pass

    def getSalesGrowthAcceleration(self, numQuarters):
        """Returns the (mean) acceleration of Sales growth over the specified number of quarters.

        The acceleration is calculated as the second derivative of the data. numQuarters is required to
        be between 2 and 15. At least three quarters are necessary to calculate acceleration.
        """
        pass

    def plotEpsQuarter(self):
        """Generates a log-plot of quarterly EPS data."""
        pass

    def plotStockData(self):
        """Generates a plot of the weekly stock data for the last three years."""
        pass

    def getStockData(self):
        """Downloads the weekly stock data for the last three years from somewhere."""
        pass

    def getStockGrowth(self):
        """Returns the stock growth as the slope of the best-fit line through the stock data."""
        pass

    def getStockAcceleration(self):
        """Fits the equation a*x^2+b*x+c through the data and returns the 'a' coefficient."""
        pass

    ## Lofty future goal: write algorithm(s) that identifies Canslim patterns in the stock data
        

# Data to extract from filing:
*  Earnings for current Q
*  Earnings for last 12-20 Q
*  Sales
*  Annual EPS for last 4 years
*  ROE
# Parameters to calculate for each company:
*  Earnings increase: earnings for current Q/earnings for same Q prior year
*  Excellency of earnings increase: each of two most recent Q/same Q's prior year + growth rate over last three years
*  Earnings growth acceleration in last 10 Q (calculate 2nd derivative?)
   *  two consecutive Qs of earnings *deceleration* could mean trouble
   *  plot earnings on log-scale for last 12 months to see acceleration
*  Sales increase: 
   *  current Q/prior Q > 25%
   *  *or* Sales growth is accelerating in the last three Q
*  Acceleration of sales growth and earnings growth in last three Q (don't sell if accelerating!)
*  Annual EPS should be increasing in each of the last three years.
*  ROE>17%
*  Stability of Q-EPS over last 3-5 years (plot EPS, fit line through data to determine growth trend, calculate deviation of EPS's from this growth trend (i.e. goodness of fit))

In [1]:
import requests
from bs4 import BeautifulSoup as Bsoup
import pandas as pd

In [2]:
# Exploratory cell: read a raw EDGAR 10-Q text dump, locate the embedded 10-Q
# document, and print the income-statement table both via pandas and via a
# manual walk over the table rows.
from io import StringIO  # pandas.read_html wants a file-like object, not literal markup

ticker = "NVDA"
## Read in the file
# The context manager closes the handle even if reading raises.
with open("1045810_NVIDIA-CORP_10-Q_2017-08-23", "r") as filing:
    lines = filing.readlines()
ser = pd.Series(lines)
#print (ser)
#this denotes where the actual 10-Q starts. I think the 10-Q part is enclosed by <document> tags
html_idxs = ser[ser == '<TYPE>10-Q\n']
#get the line number (== index number):
print("html_idxs: {}".format(html_idxs.index.values))
#and the corresponding line
print(lines[html_idxs.index.values[0]])
#html_idxs_end = (ser['<\TYPE>' in ser])
#print ("html_idxs_end: {}".format (html_idxs_end.index.values))
#print (lines[html_idxs_end.index.values[0]])

#now I can parse the 10-Q with beautifulsoup:
## Find a way to determine which lines to parse automatically. Really, I only need the html-tag, i.e. lines 55-63
## (see NVDA_finstat_test.hmtl)
# NOTE: the slice 55:63 is hard-coded for this particular filing.
soup = Bsoup("".join(lines[55:63]), "lxml")  #find the end of the section by searching for /Document?
#collect all div-tags for future use
all_divs = soup.find_all("div")
#find the div-tag that contains 'ITEM 1.&#160;&#160;FINANCIAL STATEMENTS' which indicates the start of the
#income statement's table
#will this work on all filings???

print("found {} occurrences".format(len(all_divs)))
tables = soup.find_all("table")
print("found {} tables".format(len(tables)))
#iterate over the div-tags, not sure if there is a better way. I think everything is in div tags.
flag = False  # becomes True once the "ITEM 1. ... FINANCIAL STATEMENTS" heading has been seen
count = 0     # number of div-tags fully examined before the loop stopped
for f in all_divs:
    div_text = f.get_text()
    #find beginning of financial statements
    if div_text.startswith("ITEM 1.") and "FINANCIAL STATEMENTS" in div_text:
        flag = True
        #print (count)
    if flag:
        #print (f.get_text())
        if "three months ended" in div_text.lower():
            ## At this point, we found the div-tag with the income-statement table
            ## Try to use pandas
            #print((f.find_all('div', text='$')))
            # Drop the stand-alone '$' cells; they offset the affected rows by several columns.
            for ff in f.find_all('div', text='$'):
                ff.decompose()
            ## Almost! Just need to get rid of the '$', which are offsetting the affected rows by several columns
            ## This removes the '$' (use re.compile for other currency symbols?), now what about the paren's denoting negative numbers?
            # Wrap the markup in StringIO: passing literal HTML strings to
            # read_html is deprecated in recent pandas releases.
            tableDf = pd.read_html(StringIO(str(f)))
            print(tableDf)
            ## Manual method
            trs = f.find_all("tr")
            #print (len(trs))
            #print(trs[2].get_text())
            for t in trs:
                tlist = [item.get_text() for item in t.find_all("td")]
                print(" || ".join(tlist))
            #print (trs)
            # Only the first matching table is of interest.
            break
        #print ("\n")
    count += 1



html_idxs: [48]
<TYPE>10-Q

found 5335 occurrences
found 53 tables
[                                                   0                   1   \
0                                                 NaN  Three Months Ended   
1                                                 NaN            July 30,   
2                                                 NaN                2017   
3                                                 NaN                 NaN   
4                                             Revenue                 NaN   
5                                     Cost of revenue                 928   
6                                        Gross profit                1302   
7                                  Operating expenses                 NaN   
8                            Research and development                 416   
9                   Sales, general and administrative                 198   
10                    Restructuring and other charges                   —   
11      

### See these urls for downloading the appropriate files, and parsing XBRL with Python/BeautifulSoup
*  https://www.codeproject.com/Articles/1227765/Parsing-XBRL-with-Python
*  https://www.slideshare.net/afalk42/xbrl-us-altova-webinar
### Taxonomy:
*  http://www.xbrlsite.com/LinkedData/BrowseObjectsByType_HTML.aspx?Type=%5BConcept%5D&Submit=Submit
*  ~~I think I want the instance xml file for each filing,~~ though that \*.txt file I've been downloading so far contains *everything* (I think). Could use that as well, the benefits are that the name of this file is standardized and I already have the machinery to download it in place, but the trade-off is that it's the biggest file for each filing. 
   *  The instance document can be found in the \*.txt file; it has the `<description>` XBRL INSTANCE DOCUMENT. Inside the description-tag is the `<XBRL>`, which contains the XBRL-data for the instance document. Note that there are several XBRL- tags containing the various XBRL documents (schema, extension, linkbases, ...).  
*  Search for the tags 'us-gaap:\*', they contain the items for the financial statements.
*  Also, Wikipedia has a good reference on the components of XBRL: 
   *  https://en.wikipedia.org/wiki/XBRL
### To do:
-  Figure out which taxonomy items I want/need, and all their permutations
-  Determine the date on each item
   -  Each us-gaap:\* - tag has a context-ref (e.g. "FI2017Q4"). Find the corresponding context (in the instance document, below the us-gaap - tags):
      ```
      <xbrli:context id="FI2017Q4">
		<xbrli:entity>
			<xbrli:identifier scheme="http://www.sec.gov/CIK">0001045810</xbrli:identifier>
		</xbrli:entity>
		<xbrli:period>
			<xbrli:instant>2017-01-29</xbrli:instant>
		</xbrli:period>
	</xbrli:context>
    ```  
### Interesting classifiers:
-  unitref=
